From 60f270753960291895cdd07d360c4e09c56c4596 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 1 Nov 2021 15:43:54 -0700
Subject: bpftool: Migrate -1 err checks of libbpf fn calls

Per [0], callers of libbpf functions with LIBBPF_STRICT_DIRECT_ERRS set
should handle negative error codes of various values (e.g. -EINVAL).
Migrate two callsites which were explicitly checking for -1 only to
handle the new scheme.

  [0]: https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide#direct-error-code-returning-libbpf_strict_direct_errs

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211101224357.2651181-2-davemarchevsky@fb.com
---
 tools/bpf/bpftool/btf_dumper.c | 2 +-
 tools/bpf/bpftool/struct_ops.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index 9c25286a5c73..6934e8634b94 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -52,7 +52,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d,
 
 	/* Get the bpf_prog's name.  Obtain from func_info. */
 	prog_fd = bpf_prog_get_fd_by_id(prog_id);
-	if (prog_fd == -1)
+	if (prog_fd < 0)
 		goto print;
 
 	prog_info = bpf_program__get_prog_info_linear(prog_fd,
diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c
index ab2d2290569a..20f803dce2e4 100644
--- a/tools/bpf/bpftool/struct_ops.c
+++ b/tools/bpf/bpftool/struct_ops.c
@@ -252,7 +252,7 @@ static struct res do_one_id(const char *id_str, work_func func, void *data,
 	}
 
 	fd = bpf_map_get_fd_by_id(id);
-	if (fd == -1) {
+	if (fd < 0) {
 		p_err("can't get map by id (%lu): %s", id, strerror(errno));
 		res.nr_errs++;
 		return res;
-- 
cgit v1.2.3


From c59765cfd193382b00454b1a4424cb78d4c065e2 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 1 Nov 2021 15:43:55 -0700
Subject: bpftool: Use bpf_obj_get_info_by_fd directly

To prepare for impending deprecation of libbpf's
bpf_program__get_prog_info_linear, migrate uses of this function to use
bpf_obj_get_info_by_fd.

Since the profile_target_name and dump_prog_id_as_func_ptr helpers were
only looking at the first func_info, avoid grabbing the rest to save a
malloc. For do_dump, add a more full-featured helper, but avoid
free/realloc of buffer when possible for multi-prog dumps.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211101224357.2651181-3-davemarchevsky@fb.com
---
 tools/bpf/bpftool/btf_dumper.c |  40 ++++++-----
 tools/bpf/bpftool/prog.c       | 159 ++++++++++++++++++++++++++++++++---------
 2 files changed, 149 insertions(+), 50 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index 6934e8634b94..f5dddf8ef404 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -32,14 +32,16 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d,
 				    const struct btf_type *func_proto,
 				    __u32 prog_id)
 {
-	struct bpf_prog_info_linear *prog_info = NULL;
 	const struct btf_type *func_type;
+	int prog_fd = -1, func_sig_len;
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
 	const char *prog_name = NULL;
-	struct bpf_func_info *finfo;
 	struct btf *prog_btf = NULL;
-	struct bpf_prog_info *info;
-	int prog_fd, func_sig_len;
+	struct bpf_func_info finfo;
+	__u32 finfo_rec_size;
 	char prog_str[1024];
+	int err;
 
 	/* Get the ptr's func_proto */
 	func_sig_len = btf_dump_func(d->btf, prog_str, func_proto, NULL, 0,
@@ -55,22 +57,27 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d,
 	if (prog_fd < 0)
 		goto print;
 
-	prog_info = bpf_program__get_prog_info_linear(prog_fd,
-						1UL << BPF_PROG_INFO_FUNC_INFO);
-	close(prog_fd);
-	if (IS_ERR(prog_info)) {
-		prog_info = NULL;
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (err)
 		goto print;
-	}
-	info = &prog_info->info;
 
-	if (!info->btf_id || !info->nr_func_info)
+	if (!info.btf_id || !info.nr_func_info)
+		goto print;
+
+	finfo_rec_size = info.func_info_rec_size;
+	memset(&info, 0, sizeof(info));
+	info.nr_func_info = 1;
+	info.func_info_rec_size = finfo_rec_size;
+	info.func_info = ptr_to_u64(&finfo);
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (err)
 		goto print;
-	prog_btf = btf__load_from_kernel_by_id(info->btf_id);
+
+	prog_btf = btf__load_from_kernel_by_id(info.btf_id);
 	if (libbpf_get_error(prog_btf))
 		goto print;
-	finfo = u64_to_ptr(info->func_info);
-	func_type = btf__type_by_id(prog_btf, finfo->type_id);
+	func_type = btf__type_by_id(prog_btf, finfo.type_id);
 	if (!func_type || !btf_is_func(func_type))
 		goto print;
 
@@ -92,7 +99,8 @@ print:
 	prog_str[sizeof(prog_str) - 1] = '\0';
 	jsonw_string(d->jw, prog_str);
 	btf__free(prog_btf);
-	free(prog_info);
+	if (prog_fd >= 0)
+		close(prog_fd);
 	return 0;
 }
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 515d22952602..dea7a49ec26e 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -100,6 +100,76 @@ static enum bpf_attach_type parse_attach_type(const char *str)
 	return __MAX_BPF_ATTACH_TYPE;
 }
 
+static int prep_prog_info(struct bpf_prog_info *const info, enum dump_mode mode,
+			  void **info_data, size_t *const info_data_sz)
+{
+	struct bpf_prog_info holder = {};
+	size_t needed = 0;
+	void *ptr;
+
+	if (mode == DUMP_JITED) {
+		holder.jited_prog_len = info->jited_prog_len;
+		needed += info->jited_prog_len;
+	} else {
+		holder.xlated_prog_len = info->xlated_prog_len;
+		needed += info->xlated_prog_len;
+	}
+
+	holder.nr_jited_ksyms = info->nr_jited_ksyms;
+	needed += info->nr_jited_ksyms * sizeof(__u64);
+
+	holder.nr_jited_func_lens = info->nr_jited_func_lens;
+	needed += info->nr_jited_func_lens * sizeof(__u32);
+
+	holder.nr_func_info = info->nr_func_info;
+	holder.func_info_rec_size = info->func_info_rec_size;
+	needed += info->nr_func_info * info->func_info_rec_size;
+
+	holder.nr_line_info = info->nr_line_info;
+	holder.line_info_rec_size = info->line_info_rec_size;
+	needed += info->nr_line_info * info->line_info_rec_size;
+
+	holder.nr_jited_line_info = info->nr_jited_line_info;
+	holder.jited_line_info_rec_size = info->jited_line_info_rec_size;
+	needed += info->nr_jited_line_info * info->jited_line_info_rec_size;
+
+	if (needed > *info_data_sz) {
+		ptr = realloc(*info_data, needed);
+		if (!ptr)
+			return -1;
+
+		*info_data = ptr;
+		*info_data_sz = needed;
+	}
+	ptr = *info_data;
+
+	if (mode == DUMP_JITED) {
+		holder.jited_prog_insns = ptr_to_u64(ptr);
+		ptr += holder.jited_prog_len;
+	} else {
+		holder.xlated_prog_insns = ptr_to_u64(ptr);
+		ptr += holder.xlated_prog_len;
+	}
+
+	holder.jited_ksyms = ptr_to_u64(ptr);
+	ptr += holder.nr_jited_ksyms * sizeof(__u64);
+
+	holder.jited_func_lens = ptr_to_u64(ptr);
+	ptr += holder.nr_jited_func_lens * sizeof(__u32);
+
+	holder.func_info = ptr_to_u64(ptr);
+	ptr += holder.nr_func_info * holder.func_info_rec_size;
+
+	holder.line_info = ptr_to_u64(ptr);
+	ptr += holder.nr_line_info * holder.line_info_rec_size;
+
+	holder.jited_line_info = ptr_to_u64(ptr);
+	ptr += holder.nr_jited_line_info * holder.jited_line_info_rec_size;
+
+	*info = holder;
+	return 0;
+}
+
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
 {
 	struct timespec real_time_ts, boot_time_ts;
@@ -803,16 +873,18 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 
 static int do_dump(int argc, char **argv)
 {
-	struct bpf_prog_info_linear *info_linear;
+	struct bpf_prog_info info;
+	__u32 info_len = sizeof(info);
+	size_t info_data_sz = 0;
+	void *info_data = NULL;
 	char *filepath = NULL;
 	bool opcodes = false;
 	bool visual = false;
 	enum dump_mode mode;
 	bool linum = false;
-	int *fds = NULL;
 	int nb_fds, i = 0;
+	int *fds = NULL;
 	int err = -1;
-	__u64 arrays;
 
 	if (is_prefix(*argv, "jited")) {
 		if (disasm_init())
@@ -872,43 +944,44 @@ static int do_dump(int argc, char **argv)
 		goto exit_close;
 	}
 
-	if (mode == DUMP_JITED)
-		arrays = 1UL << BPF_PROG_INFO_JITED_INSNS;
-	else
-		arrays = 1UL << BPF_PROG_INFO_XLATED_INSNS;
-
-	arrays |= 1UL << BPF_PROG_INFO_JITED_KSYMS;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS;
-	arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_LINE_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO;
-
 	if (json_output && nb_fds > 1)
 		jsonw_start_array(json_wtr);	/* root array */
 	for (i = 0; i < nb_fds; i++) {
-		info_linear = bpf_program__get_prog_info_linear(fds[i], arrays);
-		if (IS_ERR_OR_NULL(info_linear)) {
+		memset(&info, 0, sizeof(info));
+
+		err = bpf_obj_get_info_by_fd(fds[i], &info, &info_len);
+		if (err) {
+			p_err("can't get prog info: %s", strerror(errno));
+			break;
+		}
+
+		err = prep_prog_info(&info, mode, &info_data, &info_data_sz);
+		if (err) {
+			p_err("can't grow prog info_data");
+			break;
+		}
+
+		err = bpf_obj_get_info_by_fd(fds[i], &info, &info_len);
+		if (err) {
 			p_err("can't get prog info: %s", strerror(errno));
 			break;
 		}
 
 		if (json_output && nb_fds > 1) {
 			jsonw_start_object(json_wtr);	/* prog object */
-			print_prog_header_json(&info_linear->info);
+			print_prog_header_json(&info);
 			jsonw_name(json_wtr, "insns");
 		} else if (nb_fds > 1) {
-			print_prog_header_plain(&info_linear->info);
+			print_prog_header_plain(&info);
 		}
 
-		err = prog_dump(&info_linear->info, mode, filepath, opcodes,
-				visual, linum);
+		err = prog_dump(&info, mode, filepath, opcodes, visual, linum);
 
 		if (json_output && nb_fds > 1)
 			jsonw_end_object(json_wtr);	/* prog object */
 		else if (i != nb_fds - 1 && nb_fds > 1)
 			printf("\n");
 
-		free(info_linear);
 		if (err)
 			break;
 		close(fds[i]);
@@ -920,6 +993,7 @@ exit_close:
 	for (; i < nb_fds; i++)
 		close(fds[i]);
 exit_free:
+	free(info_data);
 	free(fds);
 	return err;
 }
@@ -2016,41 +2090,58 @@ static void profile_print_readings(void)
 
 static char *profile_target_name(int tgt_fd)
 {
-	struct bpf_prog_info_linear *info_linear;
-	struct bpf_func_info *func_info;
+	struct bpf_func_info func_info;
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
 	const struct btf_type *t;
+	__u32 func_info_rec_size;
 	struct btf *btf = NULL;
 	char *name = NULL;
+	int err;
 
-	info_linear = bpf_program__get_prog_info_linear(
-		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
-	if (IS_ERR_OR_NULL(info_linear)) {
-		p_err("failed to get info_linear for prog FD %d", tgt_fd);
-		return NULL;
+	err = bpf_obj_get_info_by_fd(tgt_fd, &info, &info_len);
+	if (err) {
+		p_err("failed to bpf_obj_get_info_by_fd for prog FD %d", tgt_fd);
+		goto out;
 	}
 
-	if (info_linear->info.btf_id == 0) {
+	if (info.btf_id == 0) {
 		p_err("prog FD %d doesn't have valid btf", tgt_fd);
 		goto out;
 	}
 
-	btf = btf__load_from_kernel_by_id(info_linear->info.btf_id);
+	func_info_rec_size = info.func_info_rec_size;
+	if (info.nr_func_info == 0) {
+		p_err("bpf_obj_get_info_by_fd for prog FD %d found 0 func_info", tgt_fd);
+		goto out;
+	}
+
+	memset(&info, 0, sizeof(info));
+	info.nr_func_info = 1;
+	info.func_info_rec_size = func_info_rec_size;
+	info.func_info = ptr_to_u64(&func_info);
+
+	err = bpf_obj_get_info_by_fd(tgt_fd, &info, &info_len);
+	if (err) {
+		p_err("failed to get func_info for prog FD %d", tgt_fd);
+		goto out;
+	}
+
+	btf = btf__load_from_kernel_by_id(info.btf_id);
 	if (libbpf_get_error(btf)) {
 		p_err("failed to load btf for prog FD %d", tgt_fd);
 		goto out;
 	}
 
-	func_info = u64_to_ptr(info_linear->info.func_info);
-	t = btf__type_by_id(btf, func_info[0].type_id);
+	t = btf__type_by_id(btf, func_info.type_id);
 	if (!t) {
 		p_err("btf %d doesn't have type %d",
-		      info_linear->info.btf_id, func_info[0].type_id);
+		      info.btf_id, func_info.type_id);
 		goto out;
 	}
 	name = strdup(btf__name_by_offset(btf, t->name_off));
 out:
 	btf__free(btf);
-	free(info_linear);
 	return name;
 }
 
-- 
cgit v1.2.3


From 199e06fe832ddca80c2167661acab0e9dec657c4 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 1 Nov 2021 15:43:56 -0700
Subject: perf: Pull in bpf_program__get_prog_info_linear

To prepare for impending deprecation of libbpf's
bpf_program__get_prog_info_linear, pull in the function and associated
helpers into the perf codebase and migrate existing uses to the perf
copy.

Since libbpf's deprecated definitions will still be visible to perf, it
is necessary to rename perf's definitions.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211101224357.2651181-4-davemarchevsky@fb.com
---
 tools/perf/Documentation/perf.data-file-format.txt |   2 +-
 tools/perf/util/Build                              |   1 +
 tools/perf/util/annotate.c                         |   3 +-
 tools/perf/util/bpf-event.c                        |  41 ++--
 tools/perf/util/bpf-event.h                        |   2 +-
 tools/perf/util/bpf-utils.c                        | 261 +++++++++++++++++++++
 tools/perf/util/bpf-utils.h                        |  76 ++++++
 tools/perf/util/bpf_counter.c                      |   6 +-
 tools/perf/util/dso.c                              |   1 +
 tools/perf/util/env.c                              |   1 +
 tools/perf/util/header.c                           |  13 +-
 11 files changed, 374 insertions(+), 33 deletions(-)
 create mode 100644 tools/perf/util/bpf-utils.c
 create mode 100644 tools/perf/util/bpf-utils.h

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index e6ff8c898ada..f56d0e0fbff6 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -346,7 +346,7 @@ to special needs.
 
         HEADER_BPF_PROG_INFO = 25,
 
-struct bpf_prog_info_linear, which contains detailed information about
+struct perf_bpil, which contains detailed information about
 a BPF program, including type, id, tag, jited/xlated instructions, etc.
 
         HEADER_BPF_BTF = 26,
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index f2914d5bed6e..ee42da1d3639 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -201,6 +201,7 @@ endif
 perf-y += perf-hooks.o
 
 perf-$(CONFIG_LIBBPF) += bpf-event.o
+perf-$(CONFIG_LIBBPF) += bpf-utils.o
 
 perf-$(CONFIG_CXX) += c++/
 
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 0bae061b2d6d..f0e5a236b7e3 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -28,6 +28,7 @@
 #include "evsel.h"
 #include "evlist.h"
 #include "bpf-event.h"
+#include "bpf-utils.h"
 #include "block-range.h"
 #include "string2.h"
 #include "util/event.h"
@@ -1700,12 +1701,12 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 {
 	struct annotation *notes = symbol__annotation(sym);
 	struct annotation_options *opts = args->options;
-	struct bpf_prog_info_linear *info_linear;
 	struct bpf_prog_linfo *prog_linfo = NULL;
 	struct bpf_prog_info_node *info_node;
 	int len = sym->end - sym->start;
 	disassembler_ftype disassemble;
 	struct map *map = args->ms.map;
+	struct perf_bpil *info_linear;
 	struct disassemble_info info;
 	struct dso *dso = map->dso;
 	int pc = 0, count, sub_id;
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 388847bab6d9..a27badb0a53a 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -10,6 +10,7 @@
 #include <internal/lib.h>
 #include <symbol/kallsyms.h>
 #include "bpf-event.h"
+#include "bpf-utils.h"
 #include "debug.h"
 #include "dso.h"
 #include "symbol.h"
@@ -32,8 +33,6 @@ struct btf * __weak btf__load_from_kernel_by_id(__u32 id)
        return err ? ERR_PTR(err) : btf;
 }
 
-#define ptr_to_u64(ptr)    ((__u64)(unsigned long)(ptr))
-
 static int snprintf_hex(char *buf, size_t size, unsigned char *data, size_t len)
 {
 	int ret = 0;
@@ -48,9 +47,9 @@ static int machine__process_bpf_event_load(struct machine *machine,
 					   union perf_event *event,
 					   struct perf_sample *sample __maybe_unused)
 {
-	struct bpf_prog_info_linear *info_linear;
 	struct bpf_prog_info_node *info_node;
 	struct perf_env *env = machine->env;
+	struct perf_bpil *info_linear;
 	int id = event->bpf.id;
 	unsigned int i;
 
@@ -175,9 +174,9 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session,
 {
 	struct perf_record_ksymbol *ksymbol_event = &event->ksymbol;
 	struct perf_record_bpf_event *bpf_event = &event->bpf;
-	struct bpf_prog_info_linear *info_linear;
 	struct perf_tool *tool = session->tool;
 	struct bpf_prog_info_node *info_node;
+	struct perf_bpil *info_linear;
 	struct bpf_prog_info *info;
 	struct btf *btf = NULL;
 	struct perf_env *env;
@@ -191,15 +190,15 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session,
 	 */
 	env = session->data ? &session->header.env : &perf_env;
 
-	arrays = 1UL << BPF_PROG_INFO_JITED_KSYMS;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS;
-	arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_PROG_TAGS;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_INSNS;
-	arrays |= 1UL << BPF_PROG_INFO_LINE_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO;
+	arrays = 1UL << PERF_BPIL_JITED_KSYMS;
+	arrays |= 1UL << PERF_BPIL_JITED_FUNC_LENS;
+	arrays |= 1UL << PERF_BPIL_FUNC_INFO;
+	arrays |= 1UL << PERF_BPIL_PROG_TAGS;
+	arrays |= 1UL << PERF_BPIL_JITED_INSNS;
+	arrays |= 1UL << PERF_BPIL_LINE_INFO;
+	arrays |= 1UL << PERF_BPIL_JITED_LINE_INFO;
 
-	info_linear = bpf_program__get_prog_info_linear(fd, arrays);
+	info_linear = get_bpf_prog_info_linear(fd, arrays);
 	if (IS_ERR_OR_NULL(info_linear)) {
 		info_linear = NULL;
 		pr_debug("%s: failed to get BPF program info. aborting\n", __func__);
@@ -452,8 +451,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 
 static void perf_env__add_bpf_info(struct perf_env *env, u32 id)
 {
-	struct bpf_prog_info_linear *info_linear;
 	struct bpf_prog_info_node *info_node;
+	struct perf_bpil *info_linear;
 	struct btf *btf = NULL;
 	u64 arrays;
 	u32 btf_id;
@@ -463,15 +462,15 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id)
 	if (fd < 0)
 		return;
 
-	arrays = 1UL << BPF_PROG_INFO_JITED_KSYMS;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_FUNC_LENS;
-	arrays |= 1UL << BPF_PROG_INFO_FUNC_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_PROG_TAGS;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_INSNS;
-	arrays |= 1UL << BPF_PROG_INFO_LINE_INFO;
-	arrays |= 1UL << BPF_PROG_INFO_JITED_LINE_INFO;
+	arrays = 1UL << PERF_BPIL_JITED_KSYMS;
+	arrays |= 1UL << PERF_BPIL_JITED_FUNC_LENS;
+	arrays |= 1UL << PERF_BPIL_FUNC_INFO;
+	arrays |= 1UL << PERF_BPIL_PROG_TAGS;
+	arrays |= 1UL << PERF_BPIL_JITED_INSNS;
+	arrays |= 1UL << PERF_BPIL_LINE_INFO;
+	arrays |= 1UL << PERF_BPIL_JITED_LINE_INFO;
 
-	info_linear = bpf_program__get_prog_info_linear(fd, arrays);
+	info_linear = get_bpf_prog_info_linear(fd, arrays);
 	if (IS_ERR_OR_NULL(info_linear)) {
 		pr_debug("%s: failed to get BPF program info. aborting\n", __func__);
 		goto out;
diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h
index 68f315c3df5b..144a8a24cc69 100644
--- a/tools/perf/util/bpf-event.h
+++ b/tools/perf/util/bpf-event.h
@@ -19,7 +19,7 @@ struct evlist;
 struct target;
 
 struct bpf_prog_info_node {
-	struct bpf_prog_info_linear	*info_linear;
+	struct perf_bpil		*info_linear;
 	struct rb_node			rb_node;
 };
 
diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c
new file mode 100644
index 000000000000..e271e05e51bc
--- /dev/null
+++ b/tools/perf/util/bpf-utils.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <stdlib.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <bpf/bpf.h>
+#include "bpf-utils.h"
+#include "debug.h"
+
+struct bpil_array_desc {
+	int	array_offset;	/* e.g. offset of jited_prog_insns */
+	int	count_offset;	/* e.g. offset of jited_prog_len */
+	int	size_offset;	/* > 0: offset of rec size,
+				 * < 0: fix size of -size_offset
+				 */
+};
+
+static struct bpil_array_desc bpil_array_desc[] = {
+	[PERF_BPIL_JITED_INSNS] = {
+		offsetof(struct bpf_prog_info, jited_prog_insns),
+		offsetof(struct bpf_prog_info, jited_prog_len),
+		-1,
+	},
+	[PERF_BPIL_XLATED_INSNS] = {
+		offsetof(struct bpf_prog_info, xlated_prog_insns),
+		offsetof(struct bpf_prog_info, xlated_prog_len),
+		-1,
+	},
+	[PERF_BPIL_MAP_IDS] = {
+		offsetof(struct bpf_prog_info, map_ids),
+		offsetof(struct bpf_prog_info, nr_map_ids),
+		-(int)sizeof(__u32),
+	},
+	[PERF_BPIL_JITED_KSYMS] = {
+		offsetof(struct bpf_prog_info, jited_ksyms),
+		offsetof(struct bpf_prog_info, nr_jited_ksyms),
+		-(int)sizeof(__u64),
+	},
+	[PERF_BPIL_JITED_FUNC_LENS] = {
+		offsetof(struct bpf_prog_info, jited_func_lens),
+		offsetof(struct bpf_prog_info, nr_jited_func_lens),
+		-(int)sizeof(__u32),
+	},
+	[PERF_BPIL_FUNC_INFO] = {
+		offsetof(struct bpf_prog_info, func_info),
+		offsetof(struct bpf_prog_info, nr_func_info),
+		offsetof(struct bpf_prog_info, func_info_rec_size),
+	},
+	[PERF_BPIL_LINE_INFO] = {
+		offsetof(struct bpf_prog_info, line_info),
+		offsetof(struct bpf_prog_info, nr_line_info),
+		offsetof(struct bpf_prog_info, line_info_rec_size),
+	},
+	[PERF_BPIL_JITED_LINE_INFO] = {
+		offsetof(struct bpf_prog_info, jited_line_info),
+		offsetof(struct bpf_prog_info, nr_jited_line_info),
+		offsetof(struct bpf_prog_info, jited_line_info_rec_size),
+	},
+	[PERF_BPIL_PROG_TAGS] = {
+		offsetof(struct bpf_prog_info, prog_tags),
+		offsetof(struct bpf_prog_info, nr_prog_tags),
+		-(int)sizeof(__u8) * BPF_TAG_SIZE,
+	},
+
+};
+
+static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info,
+					   int offset)
+{
+	__u32 *array = (__u32 *)info;
+
+	if (offset >= 0)
+		return array[offset / sizeof(__u32)];
+	return -(int)offset;
+}
+
+static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info,
+					   int offset)
+{
+	__u64 *array = (__u64 *)info;
+
+	if (offset >= 0)
+		return array[offset / sizeof(__u64)];
+	return -(int)offset;
+}
+
+static void bpf_prog_info_set_offset_u32(struct bpf_prog_info *info, int offset,
+					 __u32 val)
+{
+	__u32 *array = (__u32 *)info;
+
+	if (offset >= 0)
+		array[offset / sizeof(__u32)] = val;
+}
+
+static void bpf_prog_info_set_offset_u64(struct bpf_prog_info *info, int offset,
+					 __u64 val)
+{
+	__u64 *array = (__u64 *)info;
+
+	if (offset >= 0)
+		array[offset / sizeof(__u64)] = val;
+}
+
+struct perf_bpil *
+get_bpf_prog_info_linear(int fd, __u64 arrays)
+{
+	struct bpf_prog_info info = {};
+	struct perf_bpil *info_linear;
+	__u32 info_len = sizeof(info);
+	__u32 data_len = 0;
+	int i, err;
+	void *ptr;
+
+	if (arrays >> PERF_BPIL_LAST_ARRAY)
+		return ERR_PTR(-EINVAL);
+
+	/* step 1: get array dimensions */
+	err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
+	if (err) {
+		pr_debug("can't get prog info: %s", strerror(errno));
+		return ERR_PTR(-EFAULT);
+	}
+
+	/* step 2: calculate total size of all arrays */
+	for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+		bool include_array = (arrays & (1UL << i)) > 0;
+		struct bpil_array_desc *desc;
+		__u32 count, size;
+
+		desc = bpil_array_desc + i;
+
+		/* kernel is too old to support this field */
+		if (info_len < desc->array_offset + sizeof(__u32) ||
+		    info_len < desc->count_offset + sizeof(__u32) ||
+		    (desc->size_offset > 0 && info_len < (__u32)desc->size_offset))
+			include_array = false;
+
+		if (!include_array) {
+			arrays &= ~(1UL << i);	/* clear the bit */
+			continue;
+		}
+
+		count = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
+		size  = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
+
+		data_len += count * size;
+	}
+
+	/* step 3: allocate continuous memory */
+	data_len = roundup(data_len, sizeof(__u64));
+	info_linear = malloc(sizeof(struct perf_bpil) + data_len);
+	if (!info_linear)
+		return ERR_PTR(-ENOMEM);
+
+	/* step 4: fill data to info_linear->info */
+	info_linear->arrays = arrays;
+	memset(&info_linear->info, 0, sizeof(info));
+	ptr = info_linear->data;
+
+	for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+		struct bpil_array_desc *desc;
+		__u32 count, size;
+
+		if ((arrays & (1UL << i)) == 0)
+			continue;
+
+		desc  = bpil_array_desc + i;
+		count = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
+		size  = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
+		bpf_prog_info_set_offset_u32(&info_linear->info,
+					     desc->count_offset, count);
+		bpf_prog_info_set_offset_u32(&info_linear->info,
+					     desc->size_offset, size);
+		bpf_prog_info_set_offset_u64(&info_linear->info,
+					     desc->array_offset,
+					     ptr_to_u64(ptr));
+		ptr += count * size;
+	}
+
+	/* step 5: call syscall again to get required arrays */
+	err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len);
+	if (err) {
+		pr_debug("can't get prog info: %s", strerror(errno));
+		free(info_linear);
+		return ERR_PTR(-EFAULT);
+	}
+
+	/* step 6: verify the data */
+	for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+		struct bpil_array_desc *desc;
+		__u32 v1, v2;
+
+		if ((arrays & (1UL << i)) == 0)
+			continue;
+
+		desc = bpil_array_desc + i;
+		v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
+		v2 = bpf_prog_info_read_offset_u32(&info_linear->info,
+						   desc->count_offset);
+		if (v1 != v2)
+			pr_warning("%s: mismatch in element count\n", __func__);
+
+		v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
+		v2 = bpf_prog_info_read_offset_u32(&info_linear->info,
+						   desc->size_offset);
+		if (v1 != v2)
+			pr_warning("%s: mismatch in rec size\n", __func__);
+	}
+
+	/* step 7: update info_len and data_len */
+	info_linear->info_len = sizeof(struct bpf_prog_info);
+	info_linear->data_len = data_len;
+
+	return info_linear;
+}
+
+void bpil_addr_to_offs(struct perf_bpil *info_linear)
+{
+	int i;
+
+	for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+		struct bpil_array_desc *desc;
+		__u64 addr, offs;
+
+		if ((info_linear->arrays & (1UL << i)) == 0)
+			continue;
+
+		desc = bpil_array_desc + i;
+		addr = bpf_prog_info_read_offset_u64(&info_linear->info,
+						     desc->array_offset);
+		offs = addr - ptr_to_u64(info_linear->data);
+		bpf_prog_info_set_offset_u64(&info_linear->info,
+					     desc->array_offset, offs);
+	}
+}
+
+void bpil_offs_to_addr(struct perf_bpil *info_linear)
+{
+	int i;
+
+	for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+		struct bpil_array_desc *desc;
+		__u64 addr, offs;
+
+		if ((info_linear->arrays & (1UL << i)) == 0)
+			continue;
+
+		desc = bpil_array_desc + i;
+		offs = bpf_prog_info_read_offset_u64(&info_linear->info,
+						     desc->array_offset);
+		addr = offs + ptr_to_u64(info_linear->data);
+		bpf_prog_info_set_offset_u64(&info_linear->info,
+					     desc->array_offset, addr);
+	}
+}
diff --git a/tools/perf/util/bpf-utils.h b/tools/perf/util/bpf-utils.h
new file mode 100644
index 000000000000..86a5055cdfad
--- /dev/null
+++ b/tools/perf/util/bpf-utils.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+#ifndef __PERF_BPF_UTILS_H
+#define __PERF_BPF_UTILS_H
+
+#define ptr_to_u64(ptr)    ((__u64)(unsigned long)(ptr))
+
+#ifdef HAVE_LIBBPF_SUPPORT
+
+#include <bpf/libbpf.h>
+
+/*
+ * Get bpf_prog_info in continuous memory
+ *
+ * struct bpf_prog_info has multiple arrays. The user has option to choose
+ * arrays to fetch from kernel. The following APIs provide an uniform way to
+ * fetch these data. All arrays in bpf_prog_info are stored in a single
+ * continuous memory region. This makes it easy to store the info in a
+ * file.
+ *
+ * Before writing perf_bpil to files, it is necessary to
+ * translate pointers in bpf_prog_info to offsets. Helper functions
+ * bpil_addr_to_offs() and bpil_offs_to_addr()
+ * are introduced to switch between pointers and offsets.
+ *
+ * Examples:
+ *   # To fetch map_ids and prog_tags:
+ *   __u64 arrays = (1UL << PERF_BPIL_MAP_IDS) |
+ *           (1UL << PERF_BPIL_PROG_TAGS);
+ *   struct perf_bpil *info_linear =
+ *           get_bpf_prog_info_linear(fd, arrays);
+ *
+ *   # To save data in file
+ *   bpil_addr_to_offs(info_linear);
+ *   write(f, info_linear, sizeof(*info_linear) + info_linear->data_len);
+ *
+ *   # To read data from file
+ *   read(f, info_linear, <proper_size>);
+ *   bpil_offs_to_addr(info_linear);
+ */
+enum perf_bpil_array_types {
+	PERF_BPIL_FIRST_ARRAY = 0,
+	PERF_BPIL_JITED_INSNS = 0,
+	PERF_BPIL_XLATED_INSNS,
+	PERF_BPIL_MAP_IDS,
+	PERF_BPIL_JITED_KSYMS,
+	PERF_BPIL_JITED_FUNC_LENS,
+	PERF_BPIL_FUNC_INFO,
+	PERF_BPIL_LINE_INFO,
+	PERF_BPIL_JITED_LINE_INFO,
+	PERF_BPIL_PROG_TAGS,
+	PERF_BPIL_LAST_ARRAY,
+};
+
+struct perf_bpil {
+	/* size of struct bpf_prog_info, when the tool is compiled */
+	__u32			info_len;
+	/* total bytes allocated for data, round up to 8 bytes */
+	__u32			data_len;
+	/* which arrays are included in data */
+	__u64			arrays;
+	struct bpf_prog_info	info;
+	__u8			data[];
+};
+
+struct perf_bpil *
+get_bpf_prog_info_linear(int fd, __u64 arrays);
+
+void
+bpil_addr_to_offs(struct perf_bpil *info_linear);
+
+void
+bpil_offs_to_addr(struct perf_bpil *info_linear);
+
+#endif /* HAVE_LIBBPF_SUPPORT */
+#endif /* __PERF_BPF_UTILS_H */
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index ba0f20853651..2b04df8c5f87 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -13,6 +13,7 @@
 #include <perf/bpf_perf.h>
 
 #include "bpf_counter.h"
+#include "bpf-utils.h"
 #include "counts.h"
 #include "debug.h"
 #include "evsel.h"
@@ -61,14 +62,13 @@ static int bpf_program_profiler__destroy(struct evsel *evsel)
 
 static char *bpf_target_prog_name(int tgt_fd)
 {
-	struct bpf_prog_info_linear *info_linear;
 	struct bpf_func_info *func_info;
+	struct perf_bpil *info_linear;
 	const struct btf_type *t;
 	struct btf *btf = NULL;
 	char *name = NULL;
 
-	info_linear = bpf_program__get_prog_info_linear(
-		tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
+	info_linear = get_bpf_prog_info_linear(tgt_fd, 1UL << PERF_BPIL_FUNC_INFO);
 	if (IS_ERR_OR_NULL(info_linear)) {
 		pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd);
 		return NULL;
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 9ed9a5676d35..9cc8a1772b4b 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -14,6 +14,7 @@
 #ifdef HAVE_LIBBPF_SUPPORT
 #include <bpf/libbpf.h>
 #include "bpf-event.h"
+#include "bpf-utils.h"
 #endif
 #include "compress.h"
 #include "env.h"
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index cf773f0dec38..17f1dd0680b4 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -16,6 +16,7 @@ struct perf_env perf_env;
 
 #ifdef HAVE_LIBBPF_SUPPORT
 #include "bpf-event.h"
+#include "bpf-utils.h"
 #include <bpf/libbpf.h>
 
 void perf_env__insert_bpf_prog_info(struct perf_env *env,
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 1c7414f66655..56511db8fa03 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -48,6 +48,7 @@
 #include "util/util.h" // perf_exe()
 #include "cputopo.h"
 #include "bpf-event.h"
+#include "bpf-utils.h"
 #include "clockid.h"
 #include "pmu-hybrid.h"
 
@@ -1006,17 +1007,17 @@ static int write_bpf_prog_info(struct feat_fd *ff,
 
 		node = rb_entry(next, struct bpf_prog_info_node, rb_node);
 		next = rb_next(&node->rb_node);
-		len = sizeof(struct bpf_prog_info_linear) +
+		len = sizeof(struct perf_bpil) +
 			node->info_linear->data_len;
 
 		/* before writing to file, translate address to offset */
-		bpf_program__bpil_addr_to_offs(node->info_linear);
+		bpil_addr_to_offs(node->info_linear);
 		ret = do_write(ff, node->info_linear, len);
 		/*
 		 * translate back to address even when do_write() fails,
 		 * so that this function never changes the data.
 		 */
-		bpf_program__bpil_offs_to_addr(node->info_linear);
+		bpil_offs_to_addr(node->info_linear);
 		if (ret < 0)
 			goto out;
 	}
@@ -3018,9 +3019,9 @@ static int process_dir_format(struct feat_fd *ff,
 #ifdef HAVE_LIBBPF_SUPPORT
 static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
 {
-	struct bpf_prog_info_linear *info_linear;
 	struct bpf_prog_info_node *info_node;
 	struct perf_env *env = &ff->ph->env;
+	struct perf_bpil *info_linear;
 	u32 count, i;
 	int err = -1;
 
@@ -3049,7 +3050,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
 			goto out;
 		}
 
-		info_linear = malloc(sizeof(struct bpf_prog_info_linear) +
+		info_linear = malloc(sizeof(struct perf_bpil) +
 				     data_len);
 		if (!info_linear)
 			goto out;
@@ -3071,7 +3072,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
 			goto out;
 
 		/* after reading from file, translate offset to address */
-		bpf_program__bpil_offs_to_addr(info_linear);
+		bpil_offs_to_addr(info_linear);
 		info_node->info_linear = info_linear;
 		perf_env__insert_bpf_prog_info(env, info_node);
 	}
-- 
cgit v1.2.3


From f5aafbc2af51931668799a9c5080c8e35cbb571f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 1 Nov 2021 15:43:57 -0700
Subject: libbpf: Deprecate bpf_program__get_prog_info_linear

As part of the road to libbpf 1.0, and discussed in libbpf issue tracker
[0], bpf_program__get_prog_info_linear and its associated structs and
helper functions should be deprecated. The functionality is too specific
to the needs of 'perf', and there's little/no out-of-tree usage to
preclude introduction of a more general helper in the future.

  [0] Closes: https://github.com/libbpf/libbpf/issues/313

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211101224357.2651181-5-davemarchevsky@fb.com
---
 tools/lib/bpf/libbpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 9de0f299706b..797f5f8a0e20 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -918,12 +918,15 @@ struct bpf_prog_info_linear {
 	__u8			data[];
 };
 
+LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper")
 LIBBPF_API struct bpf_prog_info_linear *
 bpf_program__get_prog_info_linear(int fd, __u64 arrays);
 
+LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper")
 LIBBPF_API void
 bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear);
 
+LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper")
 LIBBPF_API void
 bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear);
 
-- 
cgit v1.2.3


From 48d5fd06453de57f9d9385c7ebcdf3b50658a962 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Nov 2021 09:30:38 -0700
Subject: selftests/seccomp: Stop USER_NOTIF test if kcmp() fails

If kcmp() fails during the USER_NOTIF test, the test is likely to hang,
so switch from EXPECT to ASSERT.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lore.kernel.org/r/20211103163039.2104830-2-keescook@chromium.org
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 1d64891e6492..d999643d577c 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -4087,7 +4087,7 @@ TEST(user_notification_addfd)
 	 * lowest available fd to be assigned here.
 	 */
 	EXPECT_EQ(fd, nextfd++);
-	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
+	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
 
 	/*
 	 * This sets the ID of the ADD FD to the last request plus 1. The
-- 
cgit v1.2.3


From 1e6d69c7b9cd7735bbf4c6754ccbb9cce8bd8ff4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Nov 2021 09:30:39 -0700
Subject: selftests/seccomp: Report event mismatches more clearly

When running under tracer, more explicitly report the status and event
mismatches to help with debugging. Additionally add an "immediate kill"
test when under tracing to verify that fatal SIGSYS behaves the same
under ptrace or seccomp tracing.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lore.kernel.org/r/20211103163039.2104830-3-keescook@chromium.org
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 54 ++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index d999643d577c..60b8d5899fe3 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1487,7 +1487,7 @@ TEST_F(precedence, log_is_fifth_in_any_order)
 #define PTRACE_EVENT_SECCOMP 7
 #endif
 
-#define IS_SECCOMP_EVENT(status) ((status >> 16) == PTRACE_EVENT_SECCOMP)
+#define PTRACE_EVENT_MASK(status) ((status) >> 16)
 bool tracer_running;
 void tracer_stop(int sig)
 {
@@ -1539,12 +1539,22 @@ void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
 
 		if (wait(&status) != tracee)
 			continue;
-		if (WIFSIGNALED(status) || WIFEXITED(status))
-			/* Child is dead. Time to go. */
+
+		if (WIFSIGNALED(status)) {
+			/* Child caught a fatal signal. */
+			return;
+		}
+		if (WIFEXITED(status)) {
+			/* Child exited with code. */
 			return;
+		}
 
-		/* Check if this is a seccomp event. */
-		ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));
+		/* Check if we got an expected event. */
+		ASSERT_EQ(WIFCONTINUED(status), false);
+		ASSERT_EQ(WIFSTOPPED(status), true);
+		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
+			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
+		}
 
 		tracer_func(_metadata, tracee, status, args);
 
@@ -1961,6 +1971,11 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
 	int ret;
 	unsigned long msg;
 
+	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
+		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
+		return;
+	}
+
 	/* Make sure we got the right message. */
 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
 	EXPECT_EQ(0, ret);
@@ -2011,6 +2026,11 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 	long *syscall_nr = NULL, *syscall_ret = NULL;
 	FIXTURE_DATA(TRACE_syscall) *self = args;
 
+	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
+		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
+		return;
+	}
+
 	/*
 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
 	 * is by counting.
@@ -2128,6 +2148,7 @@ FIXTURE_SETUP(TRACE_syscall)
 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	ASSERT_EQ(0, ret);
 
+	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
 	if (variant->use_ptrace)
 		return;
 
@@ -2186,6 +2207,29 @@ TEST_F(TRACE_syscall, syscall_faked)
 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
 }
 
+TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	/* Install "kill on mknodat" filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* This should immediately die with SIGSYS, regardless of tracer. */
+	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
+}
+
 TEST_F(TRACE_syscall, skip_after)
 {
 	struct sock_filter filter[] = {
-- 
cgit v1.2.3


From 833907876be55205d0ec153dcd819c014404ee16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 10:32:09 -0700
Subject: libbpf: Detect corrupted ELF symbols section

Prevent divide-by-zero if ELF is corrupted and has zero sh_entsize.
Reported by oss-fuzz project.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211103173213.1376990-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a1bea1953df6..71f5a009010a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3555,7 +3555,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 
 	scn = elf_sec_by_idx(obj, obj->efile.symbols_shndx);
 	sh = elf_sec_hdr(obj, scn);
-	if (!sh)
+	if (!sh || sh->sh_entsize != sizeof(Elf64_Sym))
 		return -LIBBPF_ERRNO__FORMAT;
 
 	dummy_var_btf_id = add_dummy_ksym_var(obj->btf);
-- 
cgit v1.2.3


From 88918dc12dc357a06d8d722a684617b1c87a4654 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 10:32:10 -0700
Subject: libbpf: Improve sanity checking during BTF fix up

If BTF is corrupted DATASEC's variable type ID might be incorrect.
Prevent this easy to detect situation with extra NULL check.
Reported by oss-fuzz project.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211103173213.1376990-3-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 71f5a009010a..f836a1936597 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2752,13 +2752,12 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf,
 
 	for (i = 0, vsi = btf_var_secinfos(t); i < vars; i++, vsi++) {
 		t_var = btf__type_by_id(btf, vsi->type);
-		var = btf_var(t_var);
-
-		if (!btf_is_var(t_var)) {
+		if (!t_var || !btf_is_var(t_var)) {
 			pr_debug("Non-VAR type seen in section %s\n", name);
 			return -EINVAL;
 		}
 
+		var = btf_var(t_var);
 		if (var->linkage == BTF_VAR_STATIC)
 			continue;
 
-- 
cgit v1.2.3


From 62554d52e71797eefa3fc15b54008038837bb2d4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 10:32:11 -0700
Subject: libbpf: Validate that .BTF and .BTF.ext sections contain data

.BTF and .BTF.ext ELF sections should have SHT_PROGBITS type and contain
data. If they are not, ELF is invalid or corrupted, so bail out.
Otherwise this can lead to data->d_buf being NULL and SIGSEGV later on.
Reported by oss-fuzz project.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211103173213.1376990-4-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index f836a1936597..0dc6465271ce 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3270,8 +3270,12 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		} else if (strcmp(name, MAPS_ELF_SEC) == 0) {
 			obj->efile.btf_maps_shndx = idx;
 		} else if (strcmp(name, BTF_ELF_SEC) == 0) {
+			if (sh->sh_type != SHT_PROGBITS)
+				return -LIBBPF_ERRNO__FORMAT;
 			btf_data = data;
 		} else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) {
+			if (sh->sh_type != SHT_PROGBITS)
+				return -LIBBPF_ERRNO__FORMAT;
 			btf_ext_data = data;
 		} else if (sh->sh_type == SHT_SYMTAB) {
 			/* already processed during the first pass above */
-- 
cgit v1.2.3


From 0d6988e16a12ebd41d3e268992211b0ceba44ed7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 10:32:12 -0700
Subject: libbpf: Fix section counting logic

e_shnum does include section #0 and as such is exactly the number of ELF
sections that we need to allocate memory for to use section indices as
array indices. Fix the off-by-one error.

This is purely accounting fix, previously we were overallocating one
too many array items. But no correctness errors otherwise.

Fixes: 25bbbd7a444b ("libbpf: Remove assumptions about uniqueness of .rodata/.data/.bss maps")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211103173213.1376990-5-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0dc6465271ce..ecfea6c20042 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3190,11 +3190,11 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 	Elf_Scn *scn;
 	Elf64_Shdr *sh;
 
-	/* ELF section indices are 1-based, so allocate +1 element to keep
-	 * indexing simple. Also include 0th invalid section into sec_cnt for
-	 * simpler and more traditional iteration logic.
+	/* ELF section indices are 0-based, but sec #0 is special "invalid"
+	 * section. e_shnum does include sec #0, so e_shnum is the necessary
+	 * size of an array to keep all the sections.
 	 */
-	obj->efile.sec_cnt = 1 + obj->efile.ehdr->e_shnum;
+	obj->efile.sec_cnt = obj->efile.ehdr->e_shnum;
 	obj->efile.secs = calloc(obj->efile.sec_cnt, sizeof(*obj->efile.secs));
 	if (!obj->efile.secs)
 		return -ENOMEM;
-- 
cgit v1.2.3


From b7332d2820d394dd2ac127df1567b4da597355a1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 10:32:13 -0700
Subject: libbpf: Improve ELF relo sanitization

Add few sanity checks for relocations to prevent div-by-zero and
out-of-bounds array accesses in libbpf.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211103173213.1376990-6-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ecfea6c20042..86a44735230e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3306,6 +3306,10 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		} else if (sh->sh_type == SHT_REL) {
 			int targ_sec_idx = sh->sh_info; /* points to other section */
 
+			if (sh->sh_entsize != sizeof(Elf64_Rel) ||
+			    targ_sec_idx >= obj->efile.sec_cnt)
+				return -LIBBPF_ERRNO__FORMAT;
+
 			/* Only do relo for section with exec instructions */
 			if (!section_have_execinstr(obj, targ_sec_idx) &&
 			    strcmp(name, ".rel" STRUCT_OPS_SEC) &&
@@ -4025,7 +4029,7 @@ static int
 bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data)
 {
 	const char *relo_sec_name, *sec_name;
-	size_t sec_idx = shdr->sh_info;
+	size_t sec_idx = shdr->sh_info, sym_idx;
 	struct bpf_program *prog;
 	struct reloc_desc *relos;
 	int err, i, nrels;
@@ -4036,6 +4040,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat
 	Elf64_Sym *sym;
 	Elf64_Rel *rel;
 
+	if (sec_idx >= obj->efile.sec_cnt)
+		return -EINVAL;
+
 	scn = elf_sec_by_idx(obj, sec_idx);
 	scn_data = elf_sec_data(obj, scn);
 
@@ -4055,16 +4062,23 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat
 			return -LIBBPF_ERRNO__FORMAT;
 		}
 
-		sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info));
+		sym_idx = ELF64_R_SYM(rel->r_info);
+		sym = elf_sym_by_idx(obj, sym_idx);
 		if (!sym) {
-			pr_warn("sec '%s': symbol 0x%zx not found for relo #%d\n",
-				relo_sec_name, (size_t)ELF64_R_SYM(rel->r_info), i);
+			pr_warn("sec '%s': symbol #%zu not found for relo #%d\n",
+				relo_sec_name, sym_idx, i);
+			return -LIBBPF_ERRNO__FORMAT;
+		}
+
+		if (sym->st_shndx >= obj->efile.sec_cnt) {
+			pr_warn("sec '%s': corrupted symbol #%zu pointing to invalid section #%zu for relo #%d\n",
+				relo_sec_name, sym_idx, (size_t)sym->st_shndx, i);
 			return -LIBBPF_ERRNO__FORMAT;
 		}
 
 		if (rel->r_offset % BPF_INSN_SZ || rel->r_offset >= scn_data->d_size) {
 			pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n",
-				relo_sec_name, (size_t)ELF64_R_SYM(rel->r_info), i);
+				relo_sec_name, (size_t)rel->r_offset, i);
 			return -LIBBPF_ERRNO__FORMAT;
 		}
 
-- 
cgit v1.2.3


From be2f2d1680dfb36793ea8d3110edd4a1db496352 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 2 Nov 2021 22:14:49 -0700
Subject: libbpf: Deprecate bpf_program__load() API

Mark bpf_program__load() as deprecated ([0]) since v0.6. Also rename few
internal program loading bpf_object helper functions to have more
consistent naming.

  [0] Closes: https://github.com/libbpf/libbpf/issues/301

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103051449.1884903-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 36 ++++++++++++++++++++++--------------
 tools/lib/bpf/libbpf.h |  4 ++--
 2 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 86a44735230e..7fcea11ecaa9 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6428,12 +6428,12 @@ static int libbpf_preload_prog(struct bpf_program *prog,
 	return 0;
 }
 
-static int
-load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt,
-	     char *license, __u32 kern_version, int *pfd)
+static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_program *prog,
+					 struct bpf_insn *insns, int insns_cnt,
+					 const char *license, __u32 kern_version,
+					 int *prog_fd)
 {
 	struct bpf_prog_load_params load_attr = {};
-	struct bpf_object *obj = prog->obj;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	size_t log_buf_size = 0;
 	char *log_buf = NULL;
@@ -6494,7 +6494,7 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt,
 	if (obj->gen_loader) {
 		bpf_gen__prog_load(obj->gen_loader, &load_attr,
 				   prog - obj->programs);
-		*pfd = -1;
+		*prog_fd = -1;
 		return 0;
 	}
 retry_load:
@@ -6532,7 +6532,7 @@ retry_load:
 			}
 		}
 
-		*pfd = ret;
+		*prog_fd = ret;
 		ret = 0;
 		goto out;
 	}
@@ -6608,11 +6608,12 @@ static int bpf_program__record_externs(struct bpf_program *prog)
 	return 0;
 }
 
-int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
+static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog,
+				const char *license, __u32 kern_ver)
 {
 	int err = 0, fd, i;
 
-	if (prog->obj->loaded) {
+	if (obj->loaded) {
 		pr_warn("prog '%s': can't load after object was loaded\n", prog->name);
 		return libbpf_err(-EINVAL);
 	}
@@ -6638,10 +6639,11 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
 			pr_warn("prog '%s': inconsistent nr(%d) != 1\n",
 				prog->name, prog->instances.nr);
 		}
-		if (prog->obj->gen_loader)
+		if (obj->gen_loader)
 			bpf_program__record_externs(prog);
-		err = load_program(prog, prog->insns, prog->insns_cnt,
-				   license, kern_ver, &fd);
+		err = bpf_object_load_prog_instance(obj, prog,
+						    prog->insns, prog->insns_cnt,
+						    license, kern_ver, &fd);
 		if (!err)
 			prog->instances.fds[0] = fd;
 		goto out;
@@ -6669,8 +6671,9 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver)
 			continue;
 		}
 
-		err = load_program(prog, result.new_insn_ptr,
-				   result.new_insn_cnt, license, kern_ver, &fd);
+		err = bpf_object_load_prog_instance(obj, prog,
+						    result.new_insn_ptr, result.new_insn_cnt,
+						    license, kern_ver, &fd);
 		if (err) {
 			pr_warn("Loading the %dth instance of program '%s' failed\n",
 				i, prog->name);
@@ -6687,6 +6690,11 @@ out:
 	return libbpf_err(err);
 }
 
+int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_ver)
+{
+	return bpf_object_load_prog(prog->obj, prog, license, kern_ver);
+}
+
 static int
 bpf_object__load_progs(struct bpf_object *obj, int log_level)
 {
@@ -6710,7 +6718,7 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level)
 			continue;
 		}
 		prog->log_level |= log_level;
-		err = bpf_program__load(prog, obj->license, obj->kern_version);
+		err = bpf_object_load_prog(obj, prog, obj->license, obj->kern_version);
 		if (err)
 			return err;
 	}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 797f5f8a0e20..a364c379b998 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -262,8 +262,8 @@ LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *p
  */
 LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog);
 
-LIBBPF_API int bpf_program__load(struct bpf_program *prog, char *license,
-				 __u32 kern_version);
+LIBBPF_DEPRECATED_SINCE(0, 6, "use bpf_object__load() instead")
+LIBBPF_API int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_version);
 LIBBPF_API int bpf_program__fd(const struct bpf_program *prog);
 LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated")
 LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog,
-- 
cgit v1.2.3


From b8b5cb55f5d3f03cc1479a3768d68173a10359ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 5 Nov 2021 12:10:55 -0700
Subject: libbpf: Fix non-C89 loop variable declaration in gen_loader.c

Fix the `int i` declaration inside the for statement. This is non-C89
compliant. See [0] for user report breaking BCC build.

  [0] https://github.com/libbpf/libbpf/issues/403

Fixes: 18f4fccbf314 ("libbpf: Update gen_loader to emit BTF_KIND_FUNC relocations")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20211105191055.3324874-1-andrii@kernel.org
---
 tools/lib/bpf/gen_loader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 502dea53a742..2e10776b6d85 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -584,8 +584,9 @@ void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak,
 static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_desc *relo)
 {
 	struct ksym_desc *kdesc;
+	int i;
 
-	for (int i = 0; i < gen->nr_ksyms; i++) {
+	for (i = 0; i < gen->nr_ksyms; i++) {
 		if (!strcmp(gen->ksyms[i].name, relo->name)) {
 			gen->ksyms[i].ref++;
 			return &gen->ksyms[i];
-- 
cgit v1.2.3


From be80e9cdbca8ac66d09e0e24e0bd41d992362a0b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:34 -0700
Subject: libbpf: Rename DECLARE_LIBBPF_OPTS into LIBBPF_OPTS

It's confusing that libbpf-provided helper macro doesn't start with
LIBBPF. Also "declare" vs "define" is confusing terminology, I can never
remember and always have to look up previous examples.

Bypass both issues by renaming DECLARE_LIBBPF_OPTS into a short and
clean LIBBPF_OPTS. To avoid breaking existing code, provide:

  #define DECLARE_LIBBPF_OPTS LIBBPF_OPTS

in libbpf_legacy.h. We can decide later if we ever want to remove it or
we'll keep it forever because it doesn't add any maintainability burden.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-2-andrii@kernel.org
---
 tools/lib/bpf/bpf.h           | 1 +
 tools/lib/bpf/libbpf_common.h | 2 +-
 tools/lib/bpf/libbpf_legacy.h | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 6fffb3cdf39b..f35146c1d9a9 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -29,6 +29,7 @@
 #include <stdint.h>
 
 #include "libbpf_common.h"
+#include "libbpf_legacy.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h
index aaa1efbf6f51..0967112b933a 100644
--- a/tools/lib/bpf/libbpf_common.h
+++ b/tools/lib/bpf/libbpf_common.h
@@ -54,7 +54,7 @@
  * including any extra padding, it with memset() and then assigns initial
  * values provided by users in struct initializer-syntax as varargs.
  */
-#define DECLARE_LIBBPF_OPTS(TYPE, NAME, ...)				    \
+#define LIBBPF_OPTS(TYPE, NAME, ...)					    \
 	struct TYPE NAME = ({ 						    \
 		memset(&NAME, 0, sizeof(struct TYPE));			    \
 		(struct TYPE) {						    \
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
index 5ba5c9beccfa..bb03c568af7b 100644
--- a/tools/lib/bpf/libbpf_legacy.h
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -69,6 +69,7 @@ enum libbpf_strict_mode {
 
 LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode);
 
+#define DECLARE_LIBBPF_OPTS LIBBPF_OPTS
 
 #ifdef __cplusplus
 } /* extern "C" */
-- 
cgit v1.2.3


From 45493cbaf59e3c9482e0e6a2646b362fff45db8b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:35 -0700
Subject: libbpf: Pass number of prog load attempts explicitly

Allow to control number of BPF_PROG_LOAD attempts from outside the
sys_bpf_prog_load() helper.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-3-andrii@kernel.org
---
 tools/lib/bpf/bpf.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c09cbb868c9f..8e6a23c42560 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -74,14 +74,15 @@ static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr,
 	return ensure_good_fd(fd);
 }
 
-static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size)
+#define PROG_LOAD_ATTEMPTS 5
+
+static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts)
 {
-	int retries = 5;
 	int fd;
 
 	do {
 		fd = sys_bpf_fd(BPF_PROG_LOAD, attr, size);
-	} while (fd < 0 && errno == EAGAIN && retries-- > 0);
+	} while (fd < 0 && errno == EAGAIN && --attempts > 0);
 
 	return fd;
 }
@@ -304,7 +305,7 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 		memcpy(attr.prog_name, load_attr->name,
 		       min(strlen(load_attr->name), (size_t)BPF_OBJ_NAME_LEN - 1));
 
-	fd = sys_bpf_prog_load(&attr, sizeof(attr));
+	fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
 	if (fd >= 0)
 		return fd;
 
@@ -345,7 +346,7 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 			break;
 		}
 
-		fd = sys_bpf_prog_load(&attr, sizeof(attr));
+		fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
 		if (fd >= 0)
 			goto done;
 	}
@@ -359,7 +360,7 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 	attr.log_level = 1;
 	load_attr->log_buf[0] = 0;
 
-	fd = sys_bpf_prog_load(&attr, sizeof(attr));
+	fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
 done:
 	/* free() doesn't affect errno, so we don't need to restore it */
 	free(finfo);
@@ -449,7 +450,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	attr.kern_version = kern_version;
 	attr.prog_flags = prog_flags;
 
-	fd = sys_bpf_prog_load(&attr, sizeof(attr));
+	fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
 	return libbpf_err_errno(fd);
 }
 
-- 
cgit v1.2.3


From d10ef2b825cffd0807dd733fdfd6a5bea32270d7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:36 -0700
Subject: libbpf: Unify low-level BPF_PROG_LOAD APIs into bpf_prog_load()

Add a new unified OPTS-based low-level API for program loading,
bpf_prog_load() ([0]).  bpf_prog_load() accepts few "mandatory"
parameters as input arguments (program type, name, license,
instructions) and all the other optional (as in not required to specify
for all types of BPF programs) fields into struct bpf_prog_load_opts.

This makes all the other non-extensible APIs variant for BPF_PROG_LOAD
obsolete and they are slated for deprecation in libbpf v0.7:
  - bpf_load_program();
  - bpf_load_program_xattr();
  - bpf_verify_program().

Implementation-wise, internal helper libbpf__bpf_prog_load is refactored
to become a public bpf_prog_load() API. struct bpf_prog_load_params used
internally is replaced by public struct bpf_prog_load_opts.

Unfortunately, while conceptually all this is pretty straightforward,
the biggest complication comes from the already existing bpf_prog_load()
*high-level* API, which has nothing to do with BPF_PROG_LOAD command.

We try really hard to have a new API named bpf_prog_load(), though,
because it maps naturally to BPF_PROG_LOAD command.

For that, we rename old bpf_prog_load() into bpf_prog_load_deprecated()
and mark it as COMPAT_VERSION() for shared library users compiled
against old version of libbpf. Statically linked users and shared lib
users compiled against new version of libbpf headers will get "rerouted"
to bpf_prog_deprecated() through a macro helper that decides whether to
use new or old bpf_prog_load() based on number of input arguments (see
___libbpf_overload in libbpf_common.h).

To test that existing
bpf_prog_load()-using code compiles and works as expected, I've compiled
and ran selftests as is. I had to remove (locally) selftest/bpf/Makefile
-Dbpf_prog_load=bpf_prog_test_load hack because it was conflicting with
the macro-based overload approach. I don't expect anyone else to do
something like this in practice, though. This is testing-specific way to
replace bpf_prog_load() calls with special testing variant of it, which
adds extra prog_flags value. After testing I kept this selftests hack,
but ensured that we use a new bpf_prog_load_deprecated name for this.

This patch also marks bpf_prog_load() and bpf_prog_load_xattr() as deprecated.
bpf_object interface has to be used for working with struct bpf_program.
Libbpf doesn't support loading just a bpf_program.

The silver lining is that when we get to libbpf 1.0 all these
complication will be gone and we'll have one clean bpf_prog_load()
low-level API with no backwards compatibility hackery surrounding it.

  [0] Closes: https://github.com/libbpf/libbpf/issues/284

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-4-andrii@kernel.org
---
 tools/lib/bpf/bpf.c                  | 149 +++++++++++++++++++++--------------
 tools/lib/bpf/bpf.h                  |  73 ++++++++++++++++-
 tools/lib/bpf/bpf_gen_internal.h     |   8 +-
 tools/lib/bpf/gen_loader.c           |  30 +++----
 tools/lib/bpf/libbpf.c               |  51 ++++++------
 tools/lib/bpf/libbpf.h               |   5 +-
 tools/lib/bpf/libbpf.map             |   2 +
 tools/lib/bpf/libbpf_common.h        |  12 +++
 tools/lib/bpf/libbpf_internal.h      |  31 --------
 tools/testing/selftests/bpf/Makefile |   2 +-
 10 files changed, 223 insertions(+), 140 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 8e6a23c42560..8f2a701cb079 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -28,6 +28,7 @@
 #include <asm/unistd.h>
 #include <errno.h>
 #include <linux/bpf.h>
+#include <limits.h>
 #include "bpf.h"
 #include "libbpf.h"
 #include "libbpf_internal.h"
@@ -254,58 +255,91 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt,
 	return info;
 }
 
-int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
+DEFAULT_VERSION(bpf_prog_load_v0_6_0, bpf_prog_load, LIBBPF_0.6.0)
+int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
+		         const char *prog_name, const char *license,
+		         const struct bpf_insn *insns, size_t insn_cnt,
+		         const struct bpf_prog_load_opts *opts)
 {
 	void *finfo = NULL, *linfo = NULL;
+	const char *func_info, *line_info;
+	__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
+	__u32 func_info_rec_size, line_info_rec_size;
+	int fd, attempts;
 	union bpf_attr attr;
-	int fd;
+	char *log_buf;
 
-	if (!load_attr->log_buf != !load_attr->log_buf_sz)
+	if (!OPTS_VALID(opts, bpf_prog_load_opts))
 		return libbpf_err(-EINVAL);
 
-	if (load_attr->log_level > (4 | 2 | 1) || (load_attr->log_level && !load_attr->log_buf))
+	attempts = OPTS_GET(opts, attempts, 0);
+	if (attempts < 0)
 		return libbpf_err(-EINVAL);
+	if (attempts == 0)
+		attempts = PROG_LOAD_ATTEMPTS;
 
 	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = load_attr->prog_type;
-	attr.expected_attach_type = load_attr->expected_attach_type;
 
-	if (load_attr->attach_prog_fd)
-		attr.attach_prog_fd = load_attr->attach_prog_fd;
-	else
-		attr.attach_btf_obj_fd = load_attr->attach_btf_obj_fd;
-	attr.attach_btf_id = load_attr->attach_btf_id;
+	attr.prog_type = prog_type;
+	attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0);
 
-	attr.prog_ifindex = load_attr->prog_ifindex;
-	attr.kern_version = load_attr->kern_version;
+	attr.prog_btf_fd = OPTS_GET(opts, prog_btf_fd, 0);
+	attr.prog_flags = OPTS_GET(opts, prog_flags, 0);
+	attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0);
+	attr.kern_version = OPTS_GET(opts, kern_version, 0);
 
-	attr.insn_cnt = (__u32)load_attr->insn_cnt;
-	attr.insns = ptr_to_u64(load_attr->insns);
-	attr.license = ptr_to_u64(load_attr->license);
+	if (prog_name)
+		strncat(attr.prog_name, prog_name, sizeof(attr.prog_name) - 1);
+	attr.license = ptr_to_u64(license);
 
-	attr.log_level = load_attr->log_level;
-	if (attr.log_level) {
-		attr.log_buf = ptr_to_u64(load_attr->log_buf);
-		attr.log_size = load_attr->log_buf_sz;
-	}
+	if (insn_cnt > UINT_MAX)
+		return libbpf_err(-E2BIG);
 
-	attr.prog_btf_fd = load_attr->prog_btf_fd;
-	attr.prog_flags = load_attr->prog_flags;
+	attr.insns = ptr_to_u64(insns);
+	attr.insn_cnt = (__u32)insn_cnt;
 
-	attr.func_info_rec_size = load_attr->func_info_rec_size;
-	attr.func_info_cnt = load_attr->func_info_cnt;
-	attr.func_info = ptr_to_u64(load_attr->func_info);
+	attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0);
+	attach_btf_obj_fd = OPTS_GET(opts, attach_btf_obj_fd, 0);
 
-	attr.line_info_rec_size = load_attr->line_info_rec_size;
-	attr.line_info_cnt = load_attr->line_info_cnt;
-	attr.line_info = ptr_to_u64(load_attr->line_info);
-	attr.fd_array = ptr_to_u64(load_attr->fd_array);
+	if (attach_prog_fd && attach_btf_obj_fd)
+		return libbpf_err(-EINVAL);
 
-	if (load_attr->name)
-		memcpy(attr.prog_name, load_attr->name,
-		       min(strlen(load_attr->name), (size_t)BPF_OBJ_NAME_LEN - 1));
+	attr.attach_btf_id = OPTS_GET(opts, attach_btf_id, 0);
+	if (attach_prog_fd)
+		attr.attach_prog_fd = attach_prog_fd;
+	else
+		attr.attach_btf_obj_fd = attach_btf_obj_fd;
 
-	fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
+	log_buf = OPTS_GET(opts, log_buf, NULL);
+	log_size = OPTS_GET(opts, log_size, 0);
+	log_level = OPTS_GET(opts, log_level, 0);
+
+	if (!!log_buf != !!log_size)
+		return libbpf_err(-EINVAL);
+	if (log_level > (4 | 2 | 1))
+		return libbpf_err(-EINVAL);
+	if (log_level && !log_buf)
+		return libbpf_err(-EINVAL);
+
+	attr.log_level = log_level;
+	attr.log_buf = ptr_to_u64(log_buf);
+	attr.log_size = log_size;
+
+	func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0);
+	func_info = OPTS_GET(opts, func_info, NULL);
+	attr.func_info_rec_size = func_info_rec_size;
+	attr.func_info = ptr_to_u64(func_info);
+	attr.func_info_cnt = OPTS_GET(opts, func_info_cnt, 0);
+
+	line_info_rec_size = OPTS_GET(opts, line_info_rec_size, 0);
+	line_info = OPTS_GET(opts, line_info, NULL);
+	attr.line_info_rec_size = line_info_rec_size;
+	attr.line_info = ptr_to_u64(line_info);
+	attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0);
+
+	attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL));
+
+	fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
 	if (fd >= 0)
 		return fd;
 
@@ -315,11 +349,11 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 	 */
 	while (errno == E2BIG && (!finfo || !linfo)) {
 		if (!finfo && attr.func_info_cnt &&
-		    attr.func_info_rec_size < load_attr->func_info_rec_size) {
+		    attr.func_info_rec_size < func_info_rec_size) {
 			/* try with corrected func info records */
-			finfo = alloc_zero_tailing_info(load_attr->func_info,
-							load_attr->func_info_cnt,
-							load_attr->func_info_rec_size,
+			finfo = alloc_zero_tailing_info(func_info,
+							attr.func_info_cnt,
+							func_info_rec_size,
 							attr.func_info_rec_size);
 			if (!finfo) {
 				errno = E2BIG;
@@ -327,13 +361,12 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 			}
 
 			attr.func_info = ptr_to_u64(finfo);
-			attr.func_info_rec_size = load_attr->func_info_rec_size;
+			attr.func_info_rec_size = func_info_rec_size;
 		} else if (!linfo && attr.line_info_cnt &&
-			   attr.line_info_rec_size <
-			   load_attr->line_info_rec_size) {
-			linfo = alloc_zero_tailing_info(load_attr->line_info,
-							load_attr->line_info_cnt,
-							load_attr->line_info_rec_size,
+			   attr.line_info_rec_size < line_info_rec_size) {
+			linfo = alloc_zero_tailing_info(line_info,
+							attr.line_info_cnt,
+							line_info_rec_size,
 							attr.line_info_rec_size);
 			if (!linfo) {
 				errno = E2BIG;
@@ -341,26 +374,26 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr)
 			}
 
 			attr.line_info = ptr_to_u64(linfo);
-			attr.line_info_rec_size = load_attr->line_info_rec_size;
+			attr.line_info_rec_size = line_info_rec_size;
 		} else {
 			break;
 		}
 
-		fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
+		fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
 		if (fd >= 0)
 			goto done;
 	}
 
-	if (load_attr->log_level || !load_attr->log_buf)
+	if (log_level || !log_buf)
 		goto done;
 
 	/* Try again with log */
-	attr.log_buf = ptr_to_u64(load_attr->log_buf);
-	attr.log_size = load_attr->log_buf_sz;
+	log_buf[0] = 0;
+	attr.log_buf = ptr_to_u64(log_buf);
+	attr.log_size = log_size;
 	attr.log_level = 1;
-	load_attr->log_buf[0] = 0;
 
-	fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS);
+	fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
 done:
 	/* free() doesn't affect errno, so we don't need to restore it */
 	free(finfo);
@@ -371,14 +404,13 @@ done:
 int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 			   char *log_buf, size_t log_buf_sz)
 {
-	struct bpf_prog_load_params p = {};
+	LIBBPF_OPTS(bpf_prog_load_opts, p);
 
 	if (!load_attr || !log_buf != !log_buf_sz)
 		return libbpf_err(-EINVAL);
 
-	p.prog_type = load_attr->prog_type;
 	p.expected_attach_type = load_attr->expected_attach_type;
-	switch (p.prog_type) {
+	switch (load_attr->prog_type) {
 	case BPF_PROG_TYPE_STRUCT_OPS:
 	case BPF_PROG_TYPE_LSM:
 		p.attach_btf_id = load_attr->attach_btf_id;
@@ -392,12 +424,9 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 		p.prog_ifindex = load_attr->prog_ifindex;
 		p.kern_version = load_attr->kern_version;
 	}
-	p.insn_cnt = load_attr->insns_cnt;
-	p.insns = load_attr->insns;
-	p.license = load_attr->license;
 	p.log_level = load_attr->log_level;
 	p.log_buf = log_buf;
-	p.log_buf_sz = log_buf_sz;
+	p.log_size = log_buf_sz;
 	p.prog_btf_fd = load_attr->prog_btf_fd;
 	p.func_info_rec_size = load_attr->func_info_rec_size;
 	p.func_info_cnt = load_attr->func_info_cnt;
@@ -405,10 +434,10 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	p.line_info_rec_size = load_attr->line_info_rec_size;
 	p.line_info_cnt = load_attr->line_info_cnt;
 	p.line_info = load_attr->line_info;
-	p.name = load_attr->name;
 	p.prog_flags = load_attr->prog_flags;
 
-	return libbpf__bpf_prog_load(&p);
+	return bpf_prog_load(load_attr->prog_type, load_attr->name, load_attr->license,
+			     load_attr->insns, load_attr->insns_cnt, &p);
 }
 
 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index f35146c1d9a9..079cc81ac51e 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -72,6 +72,71 @@ LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type,
 				     int inner_map_fd, int max_entries,
 				     __u32 map_flags);
 
+struct bpf_prog_load_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+
+	/* libbpf can retry BPF_PROG_LOAD command if bpf() syscall returns
+	 * -EAGAIN. This field determines how many attempts libbpf has to
+	 *  make. If not specified, libbpf will use default value of 5.
+	 */
+	int attempts;
+
+	enum bpf_attach_type expected_attach_type;
+	__u32 prog_btf_fd;
+	__u32 prog_flags;
+	__u32 prog_ifindex;
+	__u32 kern_version;
+
+	__u32 attach_btf_id;
+	__u32 attach_prog_fd;
+	__u32 attach_btf_obj_fd;
+
+	const int *fd_array;
+
+	/* .BTF.ext func info data */
+	const void *func_info;
+	__u32 func_info_cnt;
+	__u32 func_info_rec_size;
+
+	/* .BTF.ext line info data */
+	const void *line_info;
+	__u32 line_info_cnt;
+	__u32 line_info_rec_size;
+
+	/* verifier log options */
+	__u32 log_level;
+	__u32 log_size;
+	char *log_buf;
+};
+#define bpf_prog_load_opts__last_field log_buf
+
+LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type,
+			     const char *prog_name, const char *license,
+			     const struct bpf_insn *insns, size_t insn_cnt,
+			     const struct bpf_prog_load_opts *opts);
+/* this "specialization" should go away in libbpf 1.0 */
+LIBBPF_API int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
+				    const char *prog_name, const char *license,
+				    const struct bpf_insn *insns, size_t insn_cnt,
+				    const struct bpf_prog_load_opts *opts);
+
+/* This is an elaborate way to not conflict with deprecated bpf_prog_load()
+ * API, defined in libbpf.h. Once we hit libbpf 1.0, all this will be gone.
+ * With this approach, if someone is calling bpf_prog_load() with
+ * 4 arguments, they will use the deprecated API, which keeps backwards
+ * compatibility (both source code and binary). If bpf_prog_load() is called
+ * with 6 arguments, though, it gets redirected to __bpf_prog_load.
+ * So looking forward to libbpf 1.0 when this hack will be gone and
+ * __bpf_prog_load() will be called just bpf_prog_load().
+ */
+#ifndef bpf_prog_load
+#define bpf_prog_load(...) ___libbpf_overload(___bpf_prog_load, __VA_ARGS__)
+#define ___bpf_prog_load4(file, type, pobj, prog_fd) \
+	bpf_prog_load_deprecated(file, type, pobj, prog_fd)
+#define ___bpf_prog_load6(prog_type, prog_name, license, insns, insn_cnt, opts) \
+	bpf_prog_load(prog_type, prog_name, license, insns, insn_cnt, opts)
+#endif /* bpf_prog_load */
+
 struct bpf_load_program_attr {
 	enum bpf_prog_type prog_type;
 	enum bpf_attach_type expected_attach_type;
@@ -103,13 +168,15 @@ struct bpf_load_program_attr {
 
 /* Recommend log buffer size */
 #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */
-LIBBPF_API int
-bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
-		       char *log_buf, size_t log_buf_sz);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead")
+LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
+				      char *log_buf, size_t log_buf_sz);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead")
 LIBBPF_API int bpf_load_program(enum bpf_prog_type type,
 				const struct bpf_insn *insns, size_t insns_cnt,
 				const char *license, __u32 kern_version,
 				char *log_buf, size_t log_buf_sz);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead")
 LIBBPF_API int bpf_verify_program(enum bpf_prog_type type,
 				  const struct bpf_insn *insns,
 				  size_t insns_cnt, __u32 prog_flags,
diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index d26e5472fe50..75ca9fb857b2 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -3,6 +3,8 @@
 #ifndef __BPF_GEN_INTERNAL_H
 #define __BPF_GEN_INTERNAL_H
 
+#include "bpf.h"
+
 struct ksym_relo_desc {
 	const char *name;
 	int kind;
@@ -50,8 +52,10 @@ int bpf_gen__finish(struct bpf_gen *gen);
 void bpf_gen__free(struct bpf_gen *gen);
 void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size);
 void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_params *map_attr, int map_idx);
-struct bpf_prog_load_params;
-void bpf_gen__prog_load(struct bpf_gen *gen, struct bpf_prog_load_params *load_attr, int prog_idx);
+void bpf_gen__prog_load(struct bpf_gen *gen,
+			enum bpf_prog_type prog_type, const char *prog_name,
+			const char *license, struct bpf_insn *insns, size_t insn_cnt,
+			struct bpf_prog_load_opts *load_attr, int prog_idx);
 void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u32 value_size);
 void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx);
 void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type);
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 2e10776b6d85..7b73f97b1fa1 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -901,27 +901,27 @@ static void cleanup_relos(struct bpf_gen *gen, int insns)
 }
 
 void bpf_gen__prog_load(struct bpf_gen *gen,
-			struct bpf_prog_load_params *load_attr, int prog_idx)
+			enum bpf_prog_type prog_type, const char *prog_name,
+			const char *license, struct bpf_insn *insns, size_t insn_cnt,
+			struct bpf_prog_load_opts *load_attr, int prog_idx)
 {
 	int attr_size = offsetofend(union bpf_attr, fd_array);
-	int prog_load_attr, license, insns, func_info, line_info;
+	int prog_load_attr, license_off, insns_off, func_info, line_info;
 	union bpf_attr attr;
 
 	memset(&attr, 0, attr_size);
-	pr_debug("gen: prog_load: type %d insns_cnt %zd\n",
-		 load_attr->prog_type, load_attr->insn_cnt);
+	pr_debug("gen: prog_load: type %d insns_cnt %zd\n", prog_type, insn_cnt);
 	/* add license string to blob of bytes */
-	license = add_data(gen, load_attr->license, strlen(load_attr->license) + 1);
+	license_off = add_data(gen, license, strlen(license) + 1);
 	/* add insns to blob of bytes */
-	insns = add_data(gen, load_attr->insns,
-			 load_attr->insn_cnt * sizeof(struct bpf_insn));
+	insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn));
 
-	attr.prog_type = load_attr->prog_type;
+	attr.prog_type = prog_type;
 	attr.expected_attach_type = load_attr->expected_attach_type;
 	attr.attach_btf_id = load_attr->attach_btf_id;
 	attr.prog_ifindex = load_attr->prog_ifindex;
 	attr.kern_version = 0;
-	attr.insn_cnt = (__u32)load_attr->insn_cnt;
+	attr.insn_cnt = (__u32)insn_cnt;
 	attr.prog_flags = load_attr->prog_flags;
 
 	attr.func_info_rec_size = load_attr->func_info_rec_size;
@@ -934,15 +934,15 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 	line_info = add_data(gen, load_attr->line_info,
 			     attr.line_info_cnt * attr.line_info_rec_size);
 
-	memcpy(attr.prog_name, load_attr->name,
-	       min((unsigned)strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1));
+	memcpy(attr.prog_name, prog_name,
+	       min((unsigned)strlen(prog_name), BPF_OBJ_NAME_LEN - 1));
 	prog_load_attr = add_data(gen, &attr, attr_size);
 
 	/* populate union bpf_attr with a pointer to license */
-	emit_rel_store(gen, attr_field(prog_load_attr, license), license);
+	emit_rel_store(gen, attr_field(prog_load_attr, license), license_off);
 
 	/* populate union bpf_attr with a pointer to instructions */
-	emit_rel_store(gen, attr_field(prog_load_attr, insns), insns);
+	emit_rel_store(gen, attr_field(prog_load_attr, insns), insns_off);
 
 	/* populate union bpf_attr with a pointer to func_info */
 	emit_rel_store(gen, attr_field(prog_load_attr, func_info), func_info);
@@ -974,12 +974,12 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 		emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7,
 				      offsetof(union bpf_attr, attach_btf_obj_fd)));
 	}
-	emit_relos(gen, insns);
+	emit_relos(gen, insns_off);
 	/* emit PROG_LOAD command */
 	emit_sys_bpf(gen, BPF_PROG_LOAD, prog_load_attr, attr_size);
 	debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt);
 	/* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */
-	cleanup_relos(gen, insns);
+	cleanup_relos(gen, insns_off);
 	if (gen->attach_kind)
 		emit_sys_close_blob(gen,
 				    attr_field(prog_load_attr, attach_btf_obj_fd));
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7fcea11ecaa9..7a82b81b8859 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -221,7 +221,7 @@ struct reloc_desc {
 struct bpf_sec_def;
 
 typedef int (*init_fn_t)(struct bpf_program *prog, long cookie);
-typedef int (*preload_fn_t)(struct bpf_program *prog, struct bpf_prog_load_params *attr, long cookie);
+typedef int (*preload_fn_t)(struct bpf_program *prog, struct bpf_prog_load_opts *opts, long cookie);
 typedef struct bpf_link *(*attach_fn_t)(const struct bpf_program *prog, long cookie);
 
 /* stored as sec_def->cookie for all libbpf-supported SEC()s */
@@ -6391,16 +6391,16 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac
 
 /* this is called as prog->sec_def->preload_fn for libbpf-supported sec_defs */
 static int libbpf_preload_prog(struct bpf_program *prog,
-			       struct bpf_prog_load_params *attr, long cookie)
+			       struct bpf_prog_load_opts *opts, long cookie)
 {
 	enum sec_def_flags def = cookie;
 
 	/* old kernels might not support specifying expected_attach_type */
 	if ((def & SEC_EXP_ATTACH_OPT) && !kernel_supports(prog->obj, FEAT_EXP_ATTACH_TYPE))
-		attr->expected_attach_type = 0;
+		opts->expected_attach_type = 0;
 
 	if (def & SEC_SLEEPABLE)
-		attr->prog_flags |= BPF_F_SLEEPABLE;
+		opts->prog_flags |= BPF_F_SLEEPABLE;
 
 	if ((prog->type == BPF_PROG_TYPE_TRACING ||
 	     prog->type == BPF_PROG_TYPE_LSM ||
@@ -6419,11 +6419,11 @@ static int libbpf_preload_prog(struct bpf_program *prog,
 
 		/* but by now libbpf common logic is not utilizing
 		 * prog->atach_btf_obj_fd/prog->attach_btf_id anymore because
-		 * this callback is called after attrs were populated by
-		 * libbpf, so this callback has to update attr explicitly here
+		 * this callback is called after opts were populated by
+		 * libbpf, so this callback has to update opts explicitly here
 		 */
-		attr->attach_btf_obj_fd = btf_obj_fd;
-		attr->attach_btf_id = btf_type_id;
+		opts->attach_btf_obj_fd = btf_obj_fd;
+		opts->attach_btf_id = btf_type_id;
 	}
 	return 0;
 }
@@ -6433,7 +6433,8 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 					 const char *license, __u32 kern_version,
 					 int *prog_fd)
 {
-	struct bpf_prog_load_params load_attr = {};
+	LIBBPF_OPTS(bpf_prog_load_opts, load_attr);
+	const char *prog_name = NULL;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	size_t log_buf_size = 0;
 	char *log_buf = NULL;
@@ -6452,13 +6453,9 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 	if (!insns || !insns_cnt)
 		return -EINVAL;
 
-	load_attr.prog_type = prog->type;
 	load_attr.expected_attach_type = prog->expected_attach_type;
 	if (kernel_supports(obj, FEAT_PROG_NAME))
-		load_attr.name = prog->name;
-	load_attr.insns = insns;
-	load_attr.insn_cnt = insns_cnt;
-	load_attr.license = license;
+		prog_name = prog->name;
 	load_attr.attach_btf_id = prog->attach_btf_id;
 	load_attr.attach_prog_fd = prog->attach_prog_fd;
 	load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd;
@@ -6492,7 +6489,8 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 	}
 
 	if (obj->gen_loader) {
-		bpf_gen__prog_load(obj->gen_loader, &load_attr,
+		bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name,
+				   license, insns, insns_cnt, &load_attr,
 				   prog - obj->programs);
 		*prog_fd = -1;
 		return 0;
@@ -6507,8 +6505,8 @@ retry_load:
 	}
 
 	load_attr.log_buf = log_buf;
-	load_attr.log_buf_sz = log_buf_size;
-	ret = libbpf__bpf_prog_load(&load_attr);
+	load_attr.log_size = log_buf_size;
+	ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr);
 
 	if (ret >= 0) {
 		if (log_buf && load_attr.log_level)
@@ -6554,19 +6552,19 @@ retry_load:
 		pr_warn("-- BEGIN DUMP LOG ---\n");
 		pr_warn("\n%s\n", log_buf);
 		pr_warn("-- END LOG --\n");
-	} else if (load_attr.insn_cnt >= BPF_MAXINSNS) {
-		pr_warn("Program too large (%zu insns), at most %d insns\n",
-			load_attr.insn_cnt, BPF_MAXINSNS);
+	} else if (insns_cnt >= BPF_MAXINSNS) {
+		pr_warn("Program too large (%d insns), at most %d insns\n",
+			insns_cnt, BPF_MAXINSNS);
 		ret = -LIBBPF_ERRNO__PROG2BIG;
-	} else if (load_attr.prog_type != BPF_PROG_TYPE_KPROBE) {
+	} else if (prog->type != BPF_PROG_TYPE_KPROBE) {
 		/* Wrong program type? */
 		int fd;
 
-		load_attr.prog_type = BPF_PROG_TYPE_KPROBE;
 		load_attr.expected_attach_type = 0;
 		load_attr.log_buf = NULL;
-		load_attr.log_buf_sz = 0;
-		fd = libbpf__bpf_prog_load(&load_attr);
+		load_attr.log_size = 0;
+		fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog_name, license,
+				   insns, insns_cnt, &load_attr);
 		if (fd >= 0) {
 			close(fd);
 			ret = -LIBBPF_ERRNO__PROGTYPE;
@@ -9170,8 +9168,9 @@ long libbpf_get_error(const void *ptr)
 	return -errno;
 }
 
-int bpf_prog_load(const char *file, enum bpf_prog_type type,
-		  struct bpf_object **pobj, int *prog_fd)
+COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1)
+int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type,
+			     struct bpf_object **pobj, int *prog_fd)
 {
 	struct bpf_prog_load_attr attr;
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a364c379b998..bbc828667b22 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -676,8 +676,9 @@ struct bpf_prog_load_attr {
 
 LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 				   struct bpf_object **pobj, int *prog_fd);
-LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type,
-			     struct bpf_object **pobj, int *prog_fd);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead")
+LIBBPF_API int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type,
+					struct bpf_object **pobj, int *prog_fd);
 
 /* XDP related API */
 struct xdp_link_info {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 43580eb47740..b895861a13c0 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -395,6 +395,8 @@ LIBBPF_0.6.0 {
 		bpf_object__next_program;
 		bpf_object__prev_map;
 		bpf_object__prev_program;
+		bpf_prog_load_deprecated;
+		bpf_prog_load;
 		bpf_program__insn_cnt;
 		bpf_program__insns;
 		btf__add_btf;
diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h
index 0967112b933a..b21cefc9c3b6 100644
--- a/tools/lib/bpf/libbpf_common.h
+++ b/tools/lib/bpf/libbpf_common.h
@@ -41,6 +41,18 @@
 #define __LIBBPF_MARK_DEPRECATED_0_7(X)
 #endif
 
+/* This set of internal macros allows to do "function overloading" based on
+ * number of arguments provided by used in backwards-compatible way during the
+ * transition to libbpf 1.0
+ * It's ugly but necessary evil that will be cleaned up when we get to 1.0.
+ * See bpf_prog_load() overload for example.
+ */
+#define ___libbpf_cat(A, B) A ## B
+#define ___libbpf_select(NAME, NUM) ___libbpf_cat(NAME, NUM)
+#define ___libbpf_nth(_1, _2, _3, _4, _5, _6, N, ...) N
+#define ___libbpf_cnt(...) ___libbpf_nth(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
+#define ___libbpf_overload(NAME, ...) ___libbpf_select(NAME, ___libbpf_cnt(__VA_ARGS__))(__VA_ARGS__)
+
 /* Helper macro to declare and initialize libbpf options struct
  *
  * This dance with uninitialized declaration, followed by memset to zero,
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index aeb79e3a8ff9..c1e34794b829 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -276,37 +276,6 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 			 const char *str_sec, size_t str_len);
 
-struct bpf_prog_load_params {
-	enum bpf_prog_type prog_type;
-	enum bpf_attach_type expected_attach_type;
-	const char *name;
-	const struct bpf_insn *insns;
-	size_t insn_cnt;
-	const char *license;
-	__u32 kern_version;
-	__u32 attach_prog_fd;
-	__u32 attach_btf_obj_fd;
-	__u32 attach_btf_id;
-	__u32 prog_ifindex;
-	__u32 prog_btf_fd;
-	__u32 prog_flags;
-
-	__u32 func_info_rec_size;
-	const void *func_info;
-	__u32 func_info_cnt;
-
-	__u32 line_info_rec_size;
-	const void *line_info;
-	__u32 line_info_cnt;
-
-	__u32 log_level;
-	char *log_buf;
-	size_t log_buf_sz;
-	int *fd_array;
-};
-
-int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr);
-
 struct bpf_create_map_params {
 	const char *name;
 	enum bpf_map_type map_type;
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 54b0a41a3775..c4497a4af3fe 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ SAN_CFLAGS	?=
 CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS)		\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
 	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)			\
-	  -Dbpf_prog_load=bpf_prog_test_load				\
+	  -Dbpf_prog_load_deprecated=bpf_prog_test_load			\
 	  -Dbpf_load_program=bpf_test_load_program
 LDLIBS += -lcap -lelf -lz -lrt -lpthread
 
-- 
cgit v1.2.3


From e32660ac6fd6bd3c9d249644330d968c6ef61b07 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:37 -0700
Subject: libbpf: Remove internal use of deprecated bpf_prog_load() variants

Remove all the internal uses of bpf_load_program_xattr(), which is
slated for deprecation in v0.7.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-5-andrii@kernel.org
---
 tools/lib/bpf/bpf.c           |   8 ++-
 tools/lib/bpf/libbpf.c        | 119 ++++++++++++++----------------------------
 tools/lib/bpf/libbpf_probes.c |  20 ++++---
 tools/lib/bpf/xsk.c           |  34 ++++--------
 4 files changed, 64 insertions(+), 117 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 8f2a701cb079..4ff45749f8cf 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -401,8 +401,12 @@ done:
 	return libbpf_err_errno(fd);
 }
 
+__attribute__((alias("bpf_load_program_xattr2")))
 int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
-			   char *log_buf, size_t log_buf_sz)
+			   char *log_buf, size_t log_buf_sz);
+
+static int bpf_load_program_xattr2(const struct bpf_load_program_attr *load_attr,
+				   char *log_buf, size_t log_buf_sz)
 {
 	LIBBPF_OPTS(bpf_prog_load_opts, p);
 
@@ -456,7 +460,7 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	load_attr.license = license;
 	load_attr.kern_version = kern_version;
 
-	return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
+	return bpf_load_program_xattr2(&load_attr, log_buf, log_buf_sz);
 }
 
 int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7a82b81b8859..5751cade0f66 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4282,30 +4282,20 @@ int bpf_map__resize(struct bpf_map *map, __u32 max_entries)
 static int
 bpf_object__probe_loading(struct bpf_object *obj)
 {
-	struct bpf_load_program_attr attr;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
-	int ret;
+	int ret, insn_cnt = ARRAY_SIZE(insns);
 
 	if (obj->gen_loader)
 		return 0;
 
 	/* make sure basic loading works */
-
-	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	attr.insns = insns;
-	attr.insns_cnt = ARRAY_SIZE(insns);
-	attr.license = "GPL";
-
-	ret = bpf_load_program_xattr(&attr, NULL, 0);
-	if (ret < 0) {
-		attr.prog_type = BPF_PROG_TYPE_TRACEPOINT;
-		ret = bpf_load_program_xattr(&attr, NULL, 0);
-	}
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
+	if (ret < 0)
+		ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0) {
 		ret = errno;
 		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
@@ -4329,28 +4319,19 @@ static int probe_fd(int fd)
 
 static int probe_kern_prog_name(void)
 {
-	struct bpf_load_program_attr attr;
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
-	int ret;
+	int ret, insn_cnt = ARRAY_SIZE(insns);
 
 	/* make sure loading with name works */
-
-	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	attr.insns = insns;
-	attr.insns_cnt = ARRAY_SIZE(insns);
-	attr.license = "GPL";
-	attr.name = "test";
-	ret = bpf_load_program_xattr(&attr, NULL, 0);
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test", "GPL", insns, insn_cnt, NULL);
 	return probe_fd(ret);
 }
 
 static int probe_kern_global_data(void)
 {
-	struct bpf_load_program_attr prg_attr;
 	struct bpf_create_map_attr map_attr;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	struct bpf_insn insns[] = {
@@ -4359,7 +4340,7 @@ static int probe_kern_global_data(void)
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
-	int ret, map;
+	int ret, map, insn_cnt = ARRAY_SIZE(insns);
 
 	memset(&map_attr, 0, sizeof(map_attr));
 	map_attr.map_type = BPF_MAP_TYPE_ARRAY;
@@ -4378,13 +4359,7 @@ static int probe_kern_global_data(void)
 
 	insns[0].imm = map;
 
-	memset(&prg_attr, 0, sizeof(prg_attr));
-	prg_attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	prg_attr.insns = insns;
-	prg_attr.insns_cnt = ARRAY_SIZE(insns);
-	prg_attr.license = "GPL";
-
-	ret = bpf_load_program_xattr(&prg_attr, NULL, 0);
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
 	close(map);
 	return probe_fd(ret);
 }
@@ -4500,30 +4475,24 @@ static int probe_kern_array_mmap(void)
 
 static int probe_kern_exp_attach_type(void)
 {
-	struct bpf_load_program_attr attr;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE);
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
 
-	memset(&attr, 0, sizeof(attr));
 	/* use any valid combination of program type and (optional)
 	 * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS)
 	 * to see if kernel supports expected_attach_type field for
 	 * BPF_PROG_LOAD command
 	 */
-	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
-	attr.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE;
-	attr.insns = insns;
-	attr.insns_cnt = ARRAY_SIZE(insns);
-	attr.license = "GPL";
-
-	return probe_fd(bpf_load_program_xattr(&attr, NULL, 0));
+	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts);
+	return probe_fd(fd);
 }
 
 static int probe_kern_probe_read_kernel(void)
 {
-	struct bpf_load_program_attr attr;
 	struct bpf_insn insns[] = {
 		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),	/* r1 = r10 (fp) */
 		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),	/* r1 += -8 */
@@ -4532,26 +4501,21 @@ static int probe_kern_probe_read_kernel(void)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel),
 		BPF_EXIT_INSN(),
 	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
 
-	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = BPF_PROG_TYPE_KPROBE;
-	attr.insns = insns;
-	attr.insns_cnt = ARRAY_SIZE(insns);
-	attr.license = "GPL";
-
-	return probe_fd(bpf_load_program_xattr(&attr, NULL, 0));
+	fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL);
+	return probe_fd(fd);
 }
 
 static int probe_prog_bind_map(void)
 {
-	struct bpf_load_program_attr prg_attr;
 	struct bpf_create_map_attr map_attr;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
-	int ret, map, prog;
+	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
 
 	memset(&map_attr, 0, sizeof(map_attr));
 	map_attr.map_type = BPF_MAP_TYPE_ARRAY;
@@ -4568,13 +4532,7 @@ static int probe_prog_bind_map(void)
 		return ret;
 	}
 
-	memset(&prg_attr, 0, sizeof(prg_attr));
-	prg_attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	prg_attr.insns = insns;
-	prg_attr.insns_cnt = ARRAY_SIZE(insns);
-	prg_attr.license = "GPL";
-
-	prog = bpf_load_program_xattr(&prg_attr, NULL, 0);
+	prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
 	if (prog < 0) {
 		close(map);
 		return 0;
@@ -4619,19 +4577,14 @@ static int probe_module_btf(void)
 
 static int probe_perf_link(void)
 {
-	struct bpf_load_program_attr attr;
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
 	int prog_fd, link_fd, err;
 
-	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = BPF_PROG_TYPE_TRACEPOINT;
-	attr.insns = insns;
-	attr.insns_cnt = ARRAY_SIZE(insns);
-	attr.license = "GPL";
-	prog_fd = bpf_load_program_xattr(&attr, NULL, 0);
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL",
+				insns, ARRAY_SIZE(insns), NULL);
 	if (prog_fd < 0)
 		return -errno;
 
@@ -9168,22 +9121,12 @@ long libbpf_get_error(const void *ptr)
 	return -errno;
 }
 
-COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1)
-int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type,
-			     struct bpf_object **pobj, int *prog_fd)
-{
-	struct bpf_prog_load_attr attr;
-
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = file;
-	attr.prog_type = type;
-	attr.expected_attach_type = 0;
-
-	return bpf_prog_load_xattr(&attr, pobj, prog_fd);
-}
-
+__attribute__((alias("bpf_prog_load_xattr2")))
 int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
-			struct bpf_object **pobj, int *prog_fd)
+			struct bpf_object **pobj, int *prog_fd);
+
+static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr,
+				struct bpf_object **pobj, int *prog_fd)
 {
 	struct bpf_object_open_attr open_attr = {};
 	struct bpf_program *prog, *first_prog = NULL;
@@ -9254,6 +9197,20 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	return 0;
 }
 
+COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1)
+int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type,
+			     struct bpf_object **pobj, int *prog_fd)
+{
+	struct bpf_prog_load_attr attr;
+
+	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
+	attr.file = file;
+	attr.prog_type = type;
+	attr.expected_attach_type = 0;
+
+	return bpf_prog_load_xattr2(&attr, pobj, prog_fd);
+}
+
 struct bpf_link {
 	int (*detach)(struct bpf_link *link);
 	void (*dealloc)(struct bpf_link *link);
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 68f2dbf364aa..02c401e314c7 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -68,21 +68,21 @@ static void
 probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	   size_t insns_cnt, char *buf, size_t buf_len, __u32 ifindex)
 {
-	struct bpf_load_program_attr xattr = {};
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
 	int fd;
 
 	switch (prog_type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
-		xattr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
+		opts.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
 		break;
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
-		xattr.expected_attach_type = BPF_CGROUP_GETSOCKOPT;
+		opts.expected_attach_type = BPF_CGROUP_GETSOCKOPT;
 		break;
 	case BPF_PROG_TYPE_SK_LOOKUP:
-		xattr.expected_attach_type = BPF_SK_LOOKUP;
+		opts.expected_attach_type = BPF_SK_LOOKUP;
 		break;
 	case BPF_PROG_TYPE_KPROBE:
-		xattr.kern_version = get_kernel_version();
+		opts.kern_version = get_kernel_version();
 		break;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_SOCKET_FILTER:
@@ -115,13 +115,11 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 		break;
 	}
 
-	xattr.prog_type = prog_type;
-	xattr.insns = insns;
-	xattr.insns_cnt = insns_cnt;
-	xattr.license = "GPL";
-	xattr.prog_ifindex = ifindex;
+	opts.prog_ifindex = ifindex;
+	opts.log_buf = buf;
+	opts.log_size = buf_len;
 
-	fd = bpf_load_program_xattr(&xattr, buf, buf_len);
+	fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, NULL);
 	if (fd >= 0)
 		close(fd);
 }
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 81f8fbc85e70..fdb22f5405c9 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -364,7 +364,6 @@ int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
 static enum xsk_prog get_xsk_prog(void)
 {
 	enum xsk_prog detected = XSK_PROG_FALLBACK;
-	struct bpf_load_program_attr prog_attr;
 	struct bpf_create_map_attr map_attr;
 	__u32 size_out, retval, duration;
 	char data_in = 0, data_out;
@@ -375,7 +374,7 @@ static enum xsk_prog get_xsk_prog(void)
 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
 		BPF_EXIT_INSN(),
 	};
-	int prog_fd, map_fd, ret;
+	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
 
 	memset(&map_attr, 0, sizeof(map_attr));
 	map_attr.map_type = BPF_MAP_TYPE_XSKMAP;
@@ -389,13 +388,7 @@ static enum xsk_prog get_xsk_prog(void)
 
 	insns[0].imm = map_fd;
 
-	memset(&prog_attr, 0, sizeof(prog_attr));
-	prog_attr.prog_type = BPF_PROG_TYPE_XDP;
-	prog_attr.insns = insns;
-	prog_attr.insns_cnt = ARRAY_SIZE(insns);
-	prog_attr.license = "GPL";
-
-	prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0);
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
 	if (prog_fd < 0) {
 		close(map_fd);
 		return detected;
@@ -495,10 +488,13 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 	};
 	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
 	enum xsk_prog option = get_xsk_prog();
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.log_buf = log_buf,
+		.log_size = log_buf_size,
+	);
 
-	prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, progs[option], insns_cnt[option],
-				   "LGPL-2.1 or BSD-2-Clause", 0, log_buf,
-				   log_buf_size);
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
+				progs[option], insns_cnt[option], &opts);
 	if (prog_fd < 0) {
 		pr_warn("BPF log buffer:\n%s", log_buf);
 		return prog_fd;
@@ -725,14 +721,12 @@ static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
 
 static bool xsk_probe_bpf_link(void)
 {
-	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
-			    .flags = XDP_FLAGS_SKB_MODE);
-	struct bpf_load_program_attr prog_attr;
+	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
 	struct bpf_insn insns[2] = {
 		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
 		BPF_EXIT_INSN()
 	};
-	int prog_fd, link_fd = -1;
+	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
 	int ifindex_lo = 1;
 	bool ret = false;
 	int err;
@@ -744,13 +738,7 @@ static bool xsk_probe_bpf_link(void)
 	if (link_fd >= 0)
 		return true;
 
-	memset(&prog_attr, 0, sizeof(prog_attr));
-	prog_attr.prog_type = BPF_PROG_TYPE_XDP;
-	prog_attr.insns = insns;
-	prog_attr.insns_cnt = ARRAY_SIZE(insns);
-	prog_attr.license = "GPL";
-
-	prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0);
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
 	if (prog_fd < 0)
 		return ret;
 
-- 
cgit v1.2.3


From bcc40fc0021d4b7c016f8bcf62bd4e21251fdee8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:38 -0700
Subject: libbpf: Stop using to-be-deprecated APIs

Remove all the internal uses of libbpf APIs that are slated to be
deprecated in v0.7.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-6-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5751cade0f66..dfd15cc60ea7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7709,7 +7709,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 	return 0;
 
 err_unpin_maps:
-	while ((map = bpf_map__prev(map, obj))) {
+	while ((map = bpf_object__prev_map(obj, map))) {
 		if (!map->pin_path)
 			continue;
 
@@ -7789,7 +7789,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
 	return 0;
 
 err_unpin_programs:
-	while ((prog = bpf_program__prev(prog, obj))) {
+	while ((prog = bpf_object__prev_program(obj, prog))) {
 		char buf[PATH_MAX];
 		int len;
 
@@ -8130,9 +8130,11 @@ int bpf_program__set_autoload(struct bpf_program *prog, bool autoload)
 	return 0;
 }
 
+static int bpf_program_nth_fd(const struct bpf_program *prog, int n);
+
 int bpf_program__fd(const struct bpf_program *prog)
 {
-	return bpf_program__nth_fd(prog, 0);
+	return bpf_program_nth_fd(prog, 0);
 }
 
 size_t bpf_program__size(const struct bpf_program *prog)
@@ -8178,7 +8180,10 @@ int bpf_program__set_prep(struct bpf_program *prog, int nr_instances,
 	return 0;
 }
 
-int bpf_program__nth_fd(const struct bpf_program *prog, int n)
+__attribute__((alias("bpf_program_nth_fd")))
+int bpf_program__nth_fd(const struct bpf_program *prog, int n);
+
+static int bpf_program_nth_fd(const struct bpf_program *prog, int n)
 {
 	int fd;
 
-- 
cgit v1.2.3


From a3c7c7e8050fc299b42fa3d89bac253a8dfa5c0c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:39 -0700
Subject: bpftool: Stop using deprecated bpf_load_program()

Switch to bpf_prog_load() instead.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-7-andrii@kernel.org
---
 tools/bpf/bpftool/feature.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index ade44577688e..5397077d0d9e 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -467,7 +467,7 @@ static bool probe_bpf_syscall(const char *define_prefix)
 {
 	bool res;
 
-	bpf_load_program(BPF_PROG_TYPE_UNSPEC, NULL, 0, NULL, 0, NULL, 0);
+	bpf_prog_load(BPF_PROG_TYPE_UNSPEC, NULL, NULL, NULL, 0, NULL);
 	res = (errno != ENOSYS);
 
 	print_bool_feature("have_bpf_syscall",
-- 
cgit v1.2.3


From 5c5edcdebfcf3a95257b0d8ef27a60af0e0ea03a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:40 -0700
Subject: libbpf: Remove deprecation attribute from struct bpf_prog_prep_result

This deprecation annotation has no effect because for struct deprecation
attribute has to be declared after struct definition. But instead of
moving it to the end of struct definition, remove it. When deprecation
will go in effect at libbpf v0.7, this deprecation attribute will cause
libbpf's own source code compilation to trigger deprecation warnings,
which is unavoidable because libbpf still has to support that API.

So keep deprecation of APIs, but don't mark structs used in API as
deprecated.

Fixes: e21d585cb3db ("libbpf: Deprecate multi-instance bpf_program APIs")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-8-andrii@kernel.org
---
 tools/lib/bpf/libbpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bbc828667b22..039058763173 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -431,7 +431,6 @@ bpf_program__attach_iter(const struct bpf_program *prog,
  * one instance. In this case bpf_program__fd(prog) is equal to
  * bpf_program__nth_fd(prog, 0).
  */
-LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insns() for getting bpf_program instructions")
 struct bpf_prog_prep_result {
 	/*
 	 * If not NULL, load new instruction array.
-- 
cgit v1.2.3


From 3d1d62397f4a12dedee09727b26cd5a4b254ebb7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:41 -0700
Subject: selftests/bpf: Fix non-strict SEC() program sections

Fix few more SEC() definitions that were previously missed.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-9-andrii@kernel.org
---
 tools/testing/selftests/bpf/progs/test_l4lb.c            | 2 +-
 tools/testing/selftests/bpf/progs/test_l4lb_noinline.c   | 2 +-
 tools/testing/selftests/bpf/progs/test_map_lock.c        | 2 +-
 tools/testing/selftests/bpf/progs/test_queue_stack_map.h | 2 +-
 tools/testing/selftests/bpf/progs/test_skb_ctx.c         | 2 +-
 tools/testing/selftests/bpf/progs/test_spin_lock.c       | 2 +-
 tools/testing/selftests/bpf/progs/test_tcp_estats.c      | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_l4lb.c b/tools/testing/selftests/bpf/progs/test_l4lb.c
index 04fee08863cb..c26057ec46dc 100644
--- a/tools/testing/selftests/bpf/progs/test_l4lb.c
+++ b/tools/testing/selftests/bpf/progs/test_l4lb.c
@@ -448,7 +448,7 @@ static __always_inline int process_packet(void *data, __u64 off, void *data_end,
 	return bpf_redirect(ifindex, 0);
 }
 
-SEC("l4lb-demo")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
index b9e2753f4f91..19e4d2071c60 100644
--- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
@@ -447,7 +447,7 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
 	return bpf_redirect(ifindex, 0);
 }
 
-SEC("l4lb-demo")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
diff --git a/tools/testing/selftests/bpf/progs/test_map_lock.c b/tools/testing/selftests/bpf/progs/test_map_lock.c
index b5c07ae7b68f..acf073db9e8b 100644
--- a/tools/testing/selftests/bpf/progs/test_map_lock.c
+++ b/tools/testing/selftests/bpf/progs/test_map_lock.c
@@ -30,7 +30,7 @@ struct {
 	__type(value, struct array_elem);
 } array_map SEC(".maps");
 
-SEC("map_lock_demo")
+SEC("cgroup/skb")
 int bpf_map_lock_test(struct __sk_buff *skb)
 {
 	struct hmap_elem zero = {}, *val;
diff --git a/tools/testing/selftests/bpf/progs/test_queue_stack_map.h b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h
index 0fcd3ff0e38a..648e8cab7a23 100644
--- a/tools/testing/selftests/bpf/progs/test_queue_stack_map.h
+++ b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h
@@ -24,7 +24,7 @@ struct {
 	__uint(value_size, sizeof(__u32));
 } map_out SEC(".maps");
 
-SEC("test")
+SEC("tc")
 int _test(struct __sk_buff *skb)
 {
 	void *data_end = (void *)(long)skb->data_end;
diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
index 1d61b36e6067..c482110cfc95 100644
--- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c
+++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
@@ -5,7 +5,7 @@
 
 char _license[] SEC("license") = "GPL";
 
-SEC("skb_ctx")
+SEC("tc")
 int process(struct __sk_buff *skb)
 {
 	#pragma clang loop unroll(full)
diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock.c b/tools/testing/selftests/bpf/progs/test_spin_lock.c
index 0d31a3b3505f..7e88309d3229 100644
--- a/tools/testing/selftests/bpf/progs/test_spin_lock.c
+++ b/tools/testing/selftests/bpf/progs/test_spin_lock.c
@@ -45,7 +45,7 @@ struct {
 
 #define CREDIT_PER_NS(delta, rate) (((delta) * rate) >> 20)
 
-SEC("spin_lock_demo")
+SEC("tc")
 int bpf_sping_lock_test(struct __sk_buff *skb)
 {
 	volatile int credit = 0, max_credit = 100, pkt_len = 64;
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_estats.c b/tools/testing/selftests/bpf/progs/test_tcp_estats.c
index 2c5c602c6011..e2ae049c2f85 100644
--- a/tools/testing/selftests/bpf/progs/test_tcp_estats.c
+++ b/tools/testing/selftests/bpf/progs/test_tcp_estats.c
@@ -244,7 +244,7 @@ static __always_inline void send_basic_event(struct sock *sk,
 	bpf_map_update_elem(&ev_record_map, &key, &ev, BPF_ANY);
 }
 
-SEC("dummy_tracepoint")
+SEC("tp/dummy/tracepoint")
 int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
 {
 	if (!arg->sock)
-- 
cgit v1.2.3


From d8e86407e5fc6c3da1e336f89bd3e9bbc1c0cf60 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:42 -0700
Subject: selftests/bpf: Convert legacy prog load APIs to bpf_prog_load()

Convert all the uses of legacy low-level BPF program loading APIs
(mostly bpf_load_program_xattr(), but also some bpf_verify_program()) to
bpf_prog_load() uses.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-10-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/align.c     | 11 +++++--
 .../selftests/bpf/prog_tests/fexit_stress.c        | 33 ++++++++++---------
 .../raw_tp_writable_reject_nbd_invalid.c           | 14 ++++----
 .../bpf/prog_tests/raw_tp_writable_test_run.c      | 29 ++++++++---------
 tools/testing/selftests/bpf/prog_tests/sockopt.c   | 19 +++++------
 tools/testing/selftests/bpf/test_lru_map.c         |  9 +----
 tools/testing/selftests/bpf/test_sock.c            | 23 +++++++------
 tools/testing/selftests/bpf/test_sock_addr.c       | 13 +++-----
 tools/testing/selftests/bpf/test_sysctl.c          | 22 +++++--------
 tools/testing/selftests/bpf/test_verifier.c        | 38 +++++++++++-----------
 10 files changed, 99 insertions(+), 112 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index 5861446d0777..837f67c6bfda 100644
--- a/tools/testing/selftests/bpf/prog_tests/align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -594,6 +594,12 @@ static int do_test_single(struct bpf_align_test *test)
 	struct bpf_insn *prog = test->insns;
 	int prog_type = test->prog_type;
 	char bpf_vlog_copy[32768];
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.prog_flags = BPF_F_STRICT_ALIGNMENT,
+		.log_buf = bpf_vlog,
+		.log_size = sizeof(bpf_vlog),
+		.log_level = 2,
+	);
 	const char *line_ptr;
 	int cur_line = -1;
 	int prog_len, i;
@@ -601,9 +607,8 @@ static int do_test_single(struct bpf_align_test *test)
 	int ret;
 
 	prog_len = probe_filter_length(prog);
-	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-				     prog, prog_len, BPF_F_STRICT_ALIGNMENT,
-				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
+	fd_prog = bpf_prog_load(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+				prog, prog_len, &opts);
 	if (fd_prog < 0 && test->result != REJECT) {
 		printf("Failed to load program.\n");
 		printf("%s", bpf_vlog);
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
index 7c9b62e971f1..e4cede6b4b2d 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
@@ -20,34 +20,33 @@ void test_fexit_stress(void)
 		BPF_EXIT_INSN(),
 	};
 
-	struct bpf_load_program_attr load_attr = {
-		.prog_type = BPF_PROG_TYPE_TRACING,
-		.license = "GPL",
-		.insns = trace_program,
-		.insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+	LIBBPF_OPTS(bpf_prog_load_opts, trace_opts,
 		.expected_attach_type = BPF_TRACE_FEXIT,
-	};
+		.log_buf = error,
+		.log_size = sizeof(error),
+	);
 
 	const struct bpf_insn skb_program[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
 		BPF_EXIT_INSN(),
 	};
 
-	struct bpf_load_program_attr skb_load_attr = {
-		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-		.license = "GPL",
-		.insns = skb_program,
-		.insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
-	};
+	LIBBPF_OPTS(bpf_prog_load_opts, skb_opts,
+		.log_buf = error,
+		.log_size = sizeof(error),
+	);
 
 	err = libbpf_find_vmlinux_btf_id("bpf_fentry_test1",
-					 load_attr.expected_attach_type);
+					 trace_opts.expected_attach_type);
 	if (CHECK(err <= 0, "find_vmlinux_btf_id", "failed: %d\n", err))
 		goto out;
-	load_attr.attach_btf_id = err;
+	trace_opts.attach_btf_id = err;
 
 	for (i = 0; i < CNT; i++) {
-		fexit_fd[i] = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+		fexit_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL",
+					    trace_program,
+					    sizeof(trace_program) / sizeof(struct bpf_insn),
+					    &trace_opts);
 		if (CHECK(fexit_fd[i] < 0, "fexit loaded",
 			  "failed: %d errno %d\n", fexit_fd[i], errno))
 			goto out;
@@ -57,7 +56,9 @@ void test_fexit_stress(void)
 			goto out;
 	}
 
-	filter_fd = bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+	filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+				  skb_program, sizeof(skb_program) / sizeof(struct bpf_insn),
+				  &skb_opts);
 	if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
 		  filter_fd, errno))
 		goto out;
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c
index 9807336a3016..e2f1445b0e10 100644
--- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c
@@ -18,15 +18,15 @@ void test_raw_tp_writable_reject_nbd_invalid(void)
 		BPF_EXIT_INSN(),
 	};
 
-	struct bpf_load_program_attr load_attr = {
-		.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
-		.license = "GPL v2",
-		.insns = program,
-		.insns_cnt = sizeof(program) / sizeof(struct bpf_insn),
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
 		.log_level = 2,
-	};
+		.log_buf = error,
+		.log_size = sizeof(error),
+	);
 
-	bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+	bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2",
+			       program, sizeof(program) / sizeof(struct bpf_insn),
+			       &opts);
 	if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load",
 		  "failed: %d errno %d\n", bpf_fd, errno))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
index ddefa1192e5d..239baccabccb 100644
--- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
@@ -17,15 +17,15 @@ void serial_test_raw_tp_writable_test_run(void)
 		BPF_EXIT_INSN(),
 	};
 
-	struct bpf_load_program_attr load_attr = {
-		.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
-		.license = "GPL v2",
-		.insns = trace_program,
-		.insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+	LIBBPF_OPTS(bpf_prog_load_opts, trace_opts,
 		.log_level = 2,
-	};
+		.log_buf = error,
+		.log_size = sizeof(error),
+	);
 
-	int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+	int bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2",
+				   trace_program, sizeof(trace_program) / sizeof(struct bpf_insn),
+				   &trace_opts);
 	if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
 		  "failed: %d errno %d\n", bpf_fd, errno))
 		return;
@@ -35,15 +35,14 @@ void serial_test_raw_tp_writable_test_run(void)
 		BPF_EXIT_INSN(),
 	};
 
-	struct bpf_load_program_attr skb_load_attr = {
-		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-		.license = "GPL v2",
-		.insns = skb_program,
-		.insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
-	};
+	LIBBPF_OPTS(bpf_prog_load_opts, skb_opts,
+		.log_buf = error,
+		.log_size = sizeof(error),
+	);
 
-	int filter_fd =
-		bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+	int filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL v2",
+				      skb_program, sizeof(skb_program) / sizeof(struct bpf_insn),
+				      &skb_opts);
 	if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
 		  filter_fd, errno))
 		goto out_bpffd;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c
index 3e8517a8395a..cd09f4c7dd92 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c
@@ -852,22 +852,21 @@ static struct sockopt_test {
 static int load_prog(const struct bpf_insn *insns,
 		     enum bpf_attach_type expected_attach_type)
 {
-	struct bpf_load_program_attr attr = {
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
 		.expected_attach_type = expected_attach_type,
-		.insns = insns,
-		.license = "GPL",
 		.log_level = 2,
-	};
-	int fd;
+		.log_buf = bpf_log_buf,
+		.log_size = sizeof(bpf_log_buf),
+	);
+	int fd, insns_cnt = 0;
 
 	for (;
-	     insns[attr.insns_cnt].code != (BPF_JMP | BPF_EXIT);
-	     attr.insns_cnt++) {
+	     insns[insns_cnt].code != (BPF_JMP | BPF_EXIT);
+	     insns_cnt++) {
 	}
-	attr.insns_cnt++;
+	insns_cnt++;
 
-	fd = bpf_load_program_xattr(&attr, bpf_log_buf, sizeof(bpf_log_buf));
+	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCKOPT, NULL, "GPL", insns, insns_cnt, &opts);
 	if (verbose && fd < 0)
 		fprintf(stderr, "%s\n", bpf_log_buf);
 
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 7e9049fa3edf..7f3d1d8460b4 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -42,7 +42,6 @@ static int create_map(int map_type, int map_flags, unsigned int size)
 static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
 					    void *value)
 {
-	struct bpf_load_program_attr prog;
 	struct bpf_create_map_attr map;
 	struct bpf_insn insns[] = {
 		BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0),
@@ -76,13 +75,7 @@ static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
 
 	insns[0].imm = mfd;
 
-	memset(&prog, 0, sizeof(prog));
-	prog.prog_type = BPF_PROG_TYPE_SCHED_CLS;
-	prog.insns = insns;
-	prog.insns_cnt = ARRAY_SIZE(insns);
-	prog.license = "GPL";
-
-	pfd = bpf_load_program_xattr(&prog, NULL, 0);
+	pfd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, NULL, "GPL", insns, ARRAY_SIZE(insns), NULL);
 	if (pfd < 0) {
 		close(mfd);
 		return -1;
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 9613f7538840..e8edd3dd3ec2 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -328,18 +328,17 @@ static size_t probe_prog_length(const struct bpf_insn *fp)
 static int load_sock_prog(const struct bpf_insn *prog,
 			  enum bpf_attach_type attach_type)
 {
-	struct bpf_load_program_attr attr;
-	int ret;
-
-	memset(&attr, 0, sizeof(struct bpf_load_program_attr));
-	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
-	attr.expected_attach_type = attach_type;
-	attr.insns = prog;
-	attr.insns_cnt = probe_prog_length(attr.insns);
-	attr.license = "GPL";
-	attr.log_level = 2;
-
-	ret = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
+	int ret, insn_cnt;
+
+	insn_cnt = probe_prog_length(prog);
+
+	opts.expected_attach_type = attach_type;
+	opts.log_buf = bpf_log_buf;
+	opts.log_size = BPF_LOG_BUF_SIZE;
+	opts.log_level = 2;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", prog, insn_cnt, &opts);
 	if (verbose && ret < 0)
 		fprintf(stderr, "%s\n", bpf_log_buf);
 
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index aa3f185fcb89..05c9e4944c01 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -645,17 +645,14 @@ static int mk_sockaddr(int domain, const char *ip, unsigned short port,
 static int load_insns(const struct sock_addr_test *test,
 		      const struct bpf_insn *insns, size_t insns_cnt)
 {
-	struct bpf_load_program_attr load_attr;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
 	int ret;
 
-	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
-	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	load_attr.expected_attach_type = test->expected_attach_type;
-	load_attr.insns = insns;
-	load_attr.insns_cnt = insns_cnt;
-	load_attr.license = "GPL";
+	opts.expected_attach_type = test->expected_attach_type;
+	opts.log_buf = bpf_log_buf;
+	opts.log_size = BPF_LOG_BUF_SIZE;
 
-	ret = bpf_load_program_xattr(&load_attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts);
 	if (ret < 0 && test->expected_result != LOAD_REJECT) {
 		log_err(">>> Loading program error.\n"
 			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index a3bb6d399daa..4a395d7a8ea9 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -1435,14 +1435,10 @@ static int load_sysctl_prog_insns(struct sysctl_test *test,
 				  const char *sysctl_path)
 {
 	struct bpf_insn *prog = test->insns;
-	struct bpf_load_program_attr attr;
-	int ret;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
+	int ret, insn_cnt;
 
-	memset(&attr, 0, sizeof(struct bpf_load_program_attr));
-	attr.prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL;
-	attr.insns = prog;
-	attr.insns_cnt = probe_prog_length(attr.insns);
-	attr.license = "GPL";
+	insn_cnt = probe_prog_length(prog);
 
 	if (test->fixup_value_insn) {
 		char buf[128];
@@ -1465,7 +1461,10 @@ static int load_sysctl_prog_insns(struct sysctl_test *test,
 			return -1;
 	}
 
-	ret = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	opts.log_buf = bpf_log_buf;
+	opts.log_size = BPF_LOG_BUF_SIZE;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SYSCTL, NULL, "GPL", prog, insn_cnt, &opts);
 	if (ret < 0 && test->result != LOAD_REJECT) {
 		log_err(">>> Loading program error.\n"
 			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
@@ -1476,15 +1475,10 @@ static int load_sysctl_prog_insns(struct sysctl_test *test,
 
 static int load_sysctl_prog_file(struct sysctl_test *test)
 {
-	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj;
 	int prog_fd;
 
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = test->prog_file;
-	attr.prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL;
-
-	if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) {
+	if (bpf_prog_test_load(test->prog_file, BPF_PROG_TYPE_CGROUP_SYSCTL, &obj, &prog_fd)) {
 		if (test->result != LOAD_REJECT)
 			log_err(">>> Loading program (%s) error.\n",
 				test->prog_file);
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 25afe423b3f0..e512b715a785 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -498,8 +498,7 @@ static int create_prog_dummy_simple(enum bpf_prog_type prog_type, int ret)
 		BPF_EXIT_INSN(),
 	};
 
-	return bpf_load_program(prog_type, prog,
-				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
+	return bpf_prog_load(prog_type, NULL, "GPL", prog, ARRAY_SIZE(prog), NULL);
 }
 
 static int create_prog_dummy_loop(enum bpf_prog_type prog_type, int mfd,
@@ -514,8 +513,7 @@ static int create_prog_dummy_loop(enum bpf_prog_type prog_type, int mfd,
 		BPF_EXIT_INSN(),
 	};
 
-	return bpf_load_program(prog_type, prog,
-				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
+	return bpf_prog_load(prog_type, NULL, "GPL", prog, ARRAY_SIZE(prog), NULL);
 }
 
 static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem,
@@ -1045,7 +1043,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	int fd_prog, expected_ret, alignment_prevented_execution;
 	int prog_len, prog_type = test->prog_type;
 	struct bpf_insn *prog = test->insns;
-	struct bpf_load_program_attr attr;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
 	int run_errs, run_successes;
 	int map_fds[MAX_NR_MAPS];
 	const char *expected_err;
@@ -1085,32 +1083,34 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		       test->result_unpriv : test->result;
 	expected_err = unpriv && test->errstr_unpriv ?
 		       test->errstr_unpriv : test->errstr;
-	memset(&attr, 0, sizeof(attr));
-	attr.prog_type = prog_type;
-	attr.expected_attach_type = test->expected_attach_type;
-	attr.insns = prog;
-	attr.insns_cnt = prog_len;
-	attr.license = "GPL";
+
+	opts.expected_attach_type = test->expected_attach_type;
 	if (verbose)
-		attr.log_level = 1;
+		opts.log_level = 1;
 	else if (expected_ret == VERBOSE_ACCEPT)
-		attr.log_level = 2;
+		opts.log_level = 2;
 	else
-		attr.log_level = 4;
-	attr.prog_flags = pflags;
+		opts.log_level = 4;
+	opts.prog_flags = pflags;
 
 	if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) {
-		attr.attach_btf_id = libbpf_find_vmlinux_btf_id(test->kfunc,
-						attr.expected_attach_type);
-		if (attr.attach_btf_id < 0) {
+		int attach_btf_id;
+
+		attach_btf_id = libbpf_find_vmlinux_btf_id(test->kfunc,
+						opts.expected_attach_type);
+		if (attach_btf_id < 0) {
 			printf("FAIL\nFailed to find BTF ID for '%s'!\n",
 				test->kfunc);
 			(*errors)++;
 			return;
 		}
+
+		opts.attach_btf_id = attach_btf_id;
 	}
 
-	fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));
+	opts.log_buf = bpf_vlog;
+	opts.log_size = sizeof(bpf_vlog);
+	fd_prog = bpf_prog_load(prog_type, NULL, "GPL", prog, prog_len, &opts);
 	saved_errno = errno;
 
 	/* BPF_PROG_TYPE_TRACING requires more setup and
-- 
cgit v1.2.3


From f87c1930ac2951d7fb3bacb523c24046c81015ed Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:43 -0700
Subject: selftests/bpf: Merge test_stub.c into testing_helpers.c

Move testing prog and object load wrappers (bpf_prog_test_load and
bpf_test_load_program) into testing_helpers.{c,h} and get rid of
otherwise useless test_stub.c. Make testing_helpers.c available to
non-test_progs binaries as well.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-11-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile          | 33 ++++++++--------
 tools/testing/selftests/bpf/test_stub.c       | 44 ---------------------
 tools/testing/selftests/bpf/testing_helpers.c | 55 +++++++++++++++++++++++++++
 tools/testing/selftests/bpf/testing_helpers.h |  6 +++
 4 files changed, 78 insertions(+), 60 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/test_stub.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c4497a4af3fe..5588c622d266 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -178,10 +178,6 @@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_tes
 	$(Q)$(MAKE) $(submake_extras) -C bpf_testmod
 	$(Q)cp bpf_testmod/bpf_testmod.ko $@
 
-$(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ)
-	$(call msg,CC,,$@)
-	$(Q)$(CC) -c $(CFLAGS) -o $@ $<
-
 DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
 
 $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
@@ -194,18 +190,23 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 
 TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
 
-$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ)
-
-$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
-$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c
-$(OUTPUT)/test_sock: cgroup_helpers.c
-$(OUTPUT)/test_sock_addr: cgroup_helpers.c
-$(OUTPUT)/test_sockmap: cgroup_helpers.c
-$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c
-$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
-$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c
-$(OUTPUT)/test_sock_fields: cgroup_helpers.c
-$(OUTPUT)/test_sysctl: cgroup_helpers.c
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
+
+$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_sock: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_sock_addr: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_sockmap: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c testing_helpers.o
+$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_sock_fields: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_sysctl: cgroup_helpers.c testing_helpers.o
+$(OUTPUT)/test_tag: testing_helpers.o
+$(OUTPUT)/test_lirc_mode2_user: testing_helpers.o
+$(OUTPUT)/xdping: testing_helpers.o
+$(OUTPUT)/flow_dissector_load: testing_helpers.o
+$(OUTPUT)/test_maps: testing_helpers.o
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
diff --git a/tools/testing/selftests/bpf/test_stub.c b/tools/testing/selftests/bpf/test_stub.c
deleted file mode 100644
index 47e132726203..000000000000
--- a/tools/testing/selftests/bpf/test_stub.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-/* Copyright (C) 2019 Netronome Systems, Inc. */
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-#include <string.h>
-
-int extra_prog_load_log_flags = 0;
-
-int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
-		       struct bpf_object **pobj, int *prog_fd)
-{
-	struct bpf_prog_load_attr attr;
-
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = file;
-	attr.prog_type = type;
-	attr.expected_attach_type = 0;
-	attr.prog_flags = BPF_F_TEST_RND_HI32;
-	attr.log_level = extra_prog_load_log_flags;
-
-	return bpf_prog_load_xattr(&attr, pobj, prog_fd);
-}
-
-int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-			  size_t insns_cnt, const char *license,
-			  __u32 kern_version, char *log_buf,
-		     size_t log_buf_sz)
-{
-	struct bpf_load_program_attr load_attr;
-
-	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
-	load_attr.prog_type = type;
-	load_attr.expected_attach_type = 0;
-	load_attr.name = NULL;
-	load_attr.insns = insns;
-	load_attr.insns_cnt = insns_cnt;
-	load_attr.license = license;
-	load_attr.kern_version = kern_version;
-	load_attr.prog_flags = BPF_F_TEST_RND_HI32;
-	load_attr.log_level = extra_prog_load_log_flags;
-
-	return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
-}
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 800d503e5cb4..ef61d43adfe4 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -1,7 +1,11 @@
 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
 /* Copyright (C) 2020 Facebook, Inc. */
 #include <stdlib.h>
+#include <string.h>
 #include <errno.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
 #include "testing_helpers.h"
 
 int parse_num_list(const char *s, bool **num_set, int *num_set_len)
@@ -78,3 +82,54 @@ __u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info)
 	}
 	return info->prog_id;
 }
+
+int extra_prog_load_log_flags = 0;
+
+int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
+		       struct bpf_object **pobj, int *prog_fd)
+{
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	int err;
+
+	obj = bpf_object__open(file);
+	if (!obj)
+		return -errno;
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	if (type != BPF_PROG_TYPE_UNSPEC)
+		bpf_program__set_type(prog, type);
+
+	err = bpf_object__load(obj);
+	if (err)
+		goto err_out;
+
+	*pobj = obj;
+	*prog_fd = bpf_program__fd(prog);
+
+	return 0;
+err_out:
+	bpf_object__close(obj);
+	return err;
+}
+
+int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+			  size_t insns_cnt, const char *license,
+			  __u32 kern_version, char *log_buf,
+			  size_t log_buf_sz)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.kern_version = kern_version,
+		.prog_flags = BPF_F_TEST_RND_HI32,
+		.log_level = extra_prog_load_log_flags,
+		.log_buf = log_buf,
+		.log_size = log_buf_sz,
+	);
+
+	return bpf_prog_load(type, NULL, license, insns, insns_cnt, &opts);
+}
diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h
index d4f8e749611b..f46ebc476ee8 100644
--- a/tools/testing/selftests/bpf/testing_helpers.h
+++ b/tools/testing/selftests/bpf/testing_helpers.h
@@ -6,3 +6,9 @@
 
 int parse_num_list(const char *s, bool **set, int *set_len);
 __u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info);
+int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
+		       struct bpf_object **pobj, int *prog_fd);
+int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+			  size_t insns_cnt, const char *license,
+			  __u32 kern_version, char *log_buf,
+			  size_t log_buf_sz);
-- 
cgit v1.2.3


From cbdb1461dcf45765a036e9f6975ffe19e69bdc33 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:44 -0700
Subject: selftests/bpf: Use explicit bpf_prog_test_load() calls everywhere

-Dbpf_prog_load_deprecated=bpf_prog_test_load trick is both ugly and
breaks when deprecation goes into effect due to macro magic. Convert all
the uses to explicit bpf_prog_test_load() calls which avoid deprecation
errors and makes everything less magical.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-12-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile                   |  2 +-
 tools/testing/selftests/bpf/flow_dissector_load.h      |  3 ++-
 tools/testing/selftests/bpf/get_cgroup_id_user.c       |  5 +++--
 tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c    |  2 +-
 tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c |  8 ++++----
 .../selftests/bpf/prog_tests/get_stack_raw_tp.c        |  4 ++--
 tools/testing/selftests/bpf/prog_tests/global_data.c   |  2 +-
 .../selftests/bpf/prog_tests/global_func_args.c        |  2 +-
 tools/testing/selftests/bpf/prog_tests/kfree_skb.c     |  2 +-
 tools/testing/selftests/bpf/prog_tests/l4lb_all.c      |  2 +-
 .../selftests/bpf/prog_tests/load_bytes_relative.c     |  2 +-
 tools/testing/selftests/bpf/prog_tests/map_lock.c      |  4 ++--
 tools/testing/selftests/bpf/prog_tests/pkt_access.c    |  2 +-
 tools/testing/selftests/bpf/prog_tests/pkt_md_access.c |  2 +-
 .../testing/selftests/bpf/prog_tests/queue_stack_map.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/skb_ctx.c       |  2 +-
 tools/testing/selftests/bpf/prog_tests/skb_helpers.c   |  2 +-
 tools/testing/selftests/bpf/prog_tests/spinlock.c      |  4 ++--
 .../testing/selftests/bpf/prog_tests/stacktrace_map.c  |  2 +-
 .../selftests/bpf/prog_tests/stacktrace_map_raw_tp.c   |  2 +-
 tools/testing/selftests/bpf/prog_tests/tailcalls.c     | 18 +++++++++---------
 .../selftests/bpf/prog_tests/task_fd_query_rawtp.c     |  2 +-
 .../selftests/bpf/prog_tests/task_fd_query_tp.c        |  4 ++--
 tools/testing/selftests/bpf/prog_tests/tcp_estats.c    |  2 +-
 .../testing/selftests/bpf/prog_tests/tp_attach_query.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/xdp.c           |  2 +-
 .../testing/selftests/bpf/prog_tests/xdp_adjust_tail.c |  6 +++---
 tools/testing/selftests/bpf/prog_tests/xdp_attach.c    |  6 +++---
 tools/testing/selftests/bpf/prog_tests/xdp_info.c      |  2 +-
 tools/testing/selftests/bpf/prog_tests/xdp_perf.c      |  2 +-
 tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c      |  2 +-
 tools/testing/selftests/bpf/test_dev_cgroup.c          |  3 ++-
 tools/testing/selftests/bpf/test_lirc_mode2_user.c     |  6 ++++--
 tools/testing/selftests/bpf/test_maps.c                |  7 ++++---
 tools/testing/selftests/bpf/test_sysctl.c              |  1 +
 tools/testing/selftests/bpf/test_tcpnotify_user.c      |  3 ++-
 tools/testing/selftests/bpf/xdping.c                   |  3 ++-
 37 files changed, 68 insertions(+), 59 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 5588c622d266..2016c583ed20 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,6 @@ SAN_CFLAGS	?=
 CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS)		\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
 	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)			\
-	  -Dbpf_prog_load_deprecated=bpf_prog_test_load			\
 	  -Dbpf_load_program=bpf_test_load_program
 LDLIBS += -lcap -lelf -lz -lrt -lpthread
 
@@ -207,6 +206,7 @@ $(OUTPUT)/test_lirc_mode2_user: testing_helpers.o
 $(OUTPUT)/xdping: testing_helpers.o
 $(OUTPUT)/flow_dissector_load: testing_helpers.o
 $(OUTPUT)/test_maps: testing_helpers.o
+$(OUTPUT)/test_verifier: testing_helpers.o
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.h b/tools/testing/selftests/bpf/flow_dissector_load.h
index 9d0acc2fc6cc..f40b585f4e7e 100644
--- a/tools/testing/selftests/bpf/flow_dissector_load.h
+++ b/tools/testing/selftests/bpf/flow_dissector_load.h
@@ -4,6 +4,7 @@
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
+#include "testing_helpers.h"
 
 static inline int bpf_flow_load(struct bpf_object **obj,
 				const char *path,
@@ -18,7 +19,7 @@ static inline int bpf_flow_load(struct bpf_object **obj,
 	int prog_array_fd;
 	int ret, fd, i;
 
-	ret = bpf_prog_load(path, BPF_PROG_TYPE_FLOW_DISSECTOR, obj,
+	ret = bpf_prog_test_load(path, BPF_PROG_TYPE_FLOW_DISSECTOR, obj,
 			    prog_fd);
 	if (ret)
 		return ret;
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
index 99628e1a1e58..3a7b82bd9e94 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_user.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -19,6 +19,7 @@
 #include <bpf/libbpf.h>
 
 #include "cgroup_helpers.h"
+#include "testing_helpers.h"
 #include "bpf_rlimit.h"
 
 #define CHECK(condition, tag, format...) ({		\
@@ -66,8 +67,8 @@ int main(int argc, char **argv)
 	if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno))
 		return 1;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
-	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_test_load", "err %d errno %d\n", err, errno))
 		goto cleanup_cgroup_env;
 
 	cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids");
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index eb8eeebe6935..0a6c5f00abd4 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -48,7 +48,7 @@ void serial_test_bpf_obj_id(void)
 	bzero(zeros, sizeof(zeros));
 	for (i = 0; i < nr_iters; i++) {
 		now = time(NULL);
-		err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT,
+		err = bpf_prog_test_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT,
 				    &objs[i], &prog_fds[i]);
 		/* test_obj_id.o is a dumb prog. It should never fail
 		 * to load.
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
index 9cff14a23bb7..fdd603ebda28 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -65,7 +65,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 	int err, tgt_fd, i;
 	struct btf *btf;
 
-	err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
+	err = bpf_prog_test_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
 			    &tgt_obj, &tgt_fd);
 	if (!ASSERT_OK(err, "tgt_prog_load"))
 		return;
@@ -224,7 +224,7 @@ static int test_second_attach(struct bpf_object *obj)
 	if (CHECK(!prog, "find_prog", "prog %s not found\n", prog_name))
 		return -ENOENT;
 
-	err = bpf_prog_load(tgt_obj_file, BPF_PROG_TYPE_UNSPEC,
+	err = bpf_prog_test_load(tgt_obj_file, BPF_PROG_TYPE_UNSPEC,
 			    &tgt_obj, &tgt_fd);
 	if (CHECK(err, "second_prog_load", "file %s err %d errno %d\n",
 		  tgt_obj_file, err, errno))
@@ -274,7 +274,7 @@ static void test_fmod_ret_freplace(void)
 	__u32 duration = 0;
 	int err, pkt_fd, attach_prog_fd;
 
-	err = bpf_prog_load(tgt_name, BPF_PROG_TYPE_UNSPEC,
+	err = bpf_prog_test_load(tgt_name, BPF_PROG_TYPE_UNSPEC,
 			    &pkt_obj, &pkt_fd);
 	/* the target prog should load fine */
 	if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n",
@@ -341,7 +341,7 @@ static void test_obj_load_failure_common(const char *obj_file,
 	int err, pkt_fd;
 	__u32 duration = 0;
 
-	err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
+	err = bpf_prog_test_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
 			    &pkt_obj, &pkt_fd);
 	/* the target prog should load fine */
 	if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n",
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 522237aa4470..569fcc6ed660 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -94,11 +94,11 @@ void test_get_stack_raw_tp(void)
 	struct bpf_map *map;
 	cpu_set_t cpu_set;
 
-	err = bpf_prog_load(file_err, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file_err, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err >= 0, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c
index afd8639f9a94..9da131b32e13 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_data.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_data.c
@@ -136,7 +136,7 @@ void test_global_data(void)
 	struct bpf_object *obj;
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
 	if (CHECK(err, "load program", "error %d loading %s\n", err, file))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/global_func_args.c b/tools/testing/selftests/bpf/prog_tests/global_func_args.c
index 8bcc2869102f..93a2439237b0 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_func_args.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_func_args.c
@@ -44,7 +44,7 @@ void test_global_func_args(void)
 	struct bpf_object *obj;
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
 	if (CHECK(err, "load program", "error %d loading %s\n", err, file))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index 01e51d16c8b8..885413ed5c96 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -74,7 +74,7 @@ void serial_test_kfree_skb(void)
 	const int zero = 0;
 	bool test_ok[2];
 
-	err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS,
+	err = bpf_prog_test_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS,
 			    &obj, &tattr.prog_fd);
 	if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
index 8073105548ff..540ef28fabff 100644
--- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
+++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
@@ -30,7 +30,7 @@ static void test_l4lb(const char *file)
 	char buf[128];
 	u32 *magic = (u32 *)buf;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
index 5a2a689dbb68..4e0b2ec057aa 100644
--- a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
+++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
@@ -27,7 +27,7 @@ void test_load_bytes_relative(void)
 	if (CHECK_FAIL(server_fd < 0))
 		goto close_cgroup_fd;
 
-	err = bpf_prog_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB,
+	err = bpf_prog_test_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB,
 			    &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		goto close_server_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c
index ce17b1ed8709..23d19e9cf26a 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c
@@ -53,9 +53,9 @@ void test_map_lock(void)
 	int err = 0, key = 0, i;
 	void *ret;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
 	if (CHECK_FAIL(err)) {
-		printf("test_map_lock:bpf_prog_load errno %d\n", errno);
+		printf("test_map_lock:bpf_prog_test_load errno %d\n", errno);
 		goto close_prog;
 	}
 	map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
index 44b514fabccd..6628710ec3c6 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
@@ -9,7 +9,7 @@ void test_pkt_access(void)
 	__u32 duration, retval;
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
index 939015cd6dba..c9d2d6a1bfcc 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
@@ -9,7 +9,7 @@ void test_pkt_md_access(void)
 	__u32 duration, retval;
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
index f47e7b1cb32c..8ccba3ab70ee 100644
--- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
@@ -27,7 +27,7 @@ static void test_queue_stack_map_by_type(int type)
 	else
 		return;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
index c437e6ba8fe2..d3106078838c 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
@@ -32,7 +32,7 @@ void test_skb_ctx(void)
 	int err;
 	int i;
 
-	err = bpf_prog_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &tattr.prog_fd);
 	if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
index f302ad84a298..6f802a1c0800 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
@@ -20,7 +20,7 @@ void test_skb_helpers(void)
 	struct bpf_object *obj;
 	int err;
 
-	err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &tattr.prog_fd);
 	if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c
index 7577a77a4c4c..6307f5d2b417 100644
--- a/tools/testing/selftests/bpf/prog_tests/spinlock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c
@@ -24,9 +24,9 @@ void test_spinlock(void)
 	int err = 0, i;
 	void *ret;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
 	if (CHECK_FAIL(err)) {
-		printf("test_spin_lock:bpf_prog_load errno %d\n", errno);
+		printf("test_spin_lock:bpf_prog_test_load errno %d\n", errno);
 		goto close_prog;
 	}
 	for (i = 0; i < 4; i++)
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index 04b476bd62b9..337493d74ec5 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -12,7 +12,7 @@ void test_stacktrace_map(void)
 	struct bpf_object *obj;
 	struct bpf_link *link;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
index 4fd30bb651ad..063a14a2060d 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -12,7 +12,7 @@ void test_stacktrace_map_raw_tp(void)
 	struct bpf_object *obj;
 	struct bpf_link *link = NULL;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 9825f1f7bfcc..5dc0f425bd11 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -16,7 +16,7 @@ static void test_tailcall_1(void)
 	char prog_name[32];
 	char buff[128] = {};
 
-	err = bpf_prog_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -154,7 +154,7 @@ static void test_tailcall_2(void)
 	char prog_name[32];
 	char buff[128] = {};
 
-	err = bpf_prog_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -228,7 +228,7 @@ static void test_tailcall_count(const char *which)
 	__u32 retval, duration;
 	char buff[128] = {};
 
-	err = bpf_prog_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -324,7 +324,7 @@ static void test_tailcall_4(void)
 	char buff[128] = {};
 	char prog_name[32];
 
-	err = bpf_prog_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -412,7 +412,7 @@ static void test_tailcall_5(void)
 	char buff[128] = {};
 	char prog_name[32];
 
-	err = bpf_prog_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+	err = bpf_prog_test_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
 			    &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -498,7 +498,7 @@ static void test_tailcall_bpf2bpf_1(void)
 	__u32 retval, duration;
 	char prog_name[32];
 
-	err = bpf_prog_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS,
+	err = bpf_prog_test_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS,
 			    &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -582,7 +582,7 @@ static void test_tailcall_bpf2bpf_2(void)
 	__u32 retval, duration;
 	char buff[128] = {};
 
-	err = bpf_prog_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS,
+	err = bpf_prog_test_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS,
 			    &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -660,7 +660,7 @@ static void test_tailcall_bpf2bpf_3(void)
 	__u32 retval, duration;
 	char prog_name[32];
 
-	err = bpf_prog_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS,
+	err = bpf_prog_test_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS,
 			    &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
@@ -757,7 +757,7 @@ static void test_tailcall_bpf2bpf_4(bool noise)
 	__u32 retval, duration;
 	char prog_name[32];
 
-	err = bpf_prog_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS,
+	err = bpf_prog_test_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS,
 			    &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
index 1bdc1d86a50c..17947c9e1d66 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
@@ -11,7 +11,7 @@ void test_task_fd_query_rawtp(void)
 	__u32 duration = 0;
 	char buf[256];
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
index 3f131b8fe328..c2a98a7a8dfc 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
@@ -13,8 +13,8 @@ static void test_task_fd_query_tp_core(const char *probe_name,
 	__u32 duration = 0;
 	char buf[256];
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
-	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_test_load", "err %d errno %d\n", err, errno))
 		goto close_prog;
 
 	snprintf(buf, sizeof(buf),
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
index 594307dffd13..11bf755be4c9 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
@@ -8,7 +8,7 @@ void test_tcp_estats(void)
 	struct bpf_object *obj;
 	__u32 duration = 0;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
 	CHECK(err, "", "err %d errno %d\n", err, errno);
 	if (err)
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
index 8652d0a46c87..39e79291c82b 100644
--- a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
+++ b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
@@ -35,7 +35,7 @@ void serial_test_tp_attach_query(void)
 
 	query = malloc(sizeof(*query) + sizeof(__u32) * num_progs);
 	for (i = 0; i < num_progs; i++) {
-		err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i],
+		err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i],
 				    &prog_fd[i]);
 		if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
 			goto cleanup1;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c
index 48921ff74850..7a7ef9d4e151 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp.c
@@ -16,7 +16,7 @@ void test_xdp(void)
 	__u32 duration, retval, size;
 	int err, prog_fd, map_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
index f529e3c923ae..3f5a17c38be5 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -10,7 +10,7 @@ static void test_xdp_adjust_tail_shrink(void)
 	int err, prog_fd;
 	char buf[128];
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
@@ -38,7 +38,7 @@ static void test_xdp_adjust_tail_grow(void)
 	__u32 duration, retval, size, expect_sz;
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
@@ -75,7 +75,7 @@ static void test_xdp_adjust_tail_grow2(void)
 		.data_size_out	= 0, /* Per test */
 	};
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd);
 	if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
index 4c4057262cd8..c6fa390e3aa1 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
@@ -16,7 +16,7 @@ void serial_test_xdp_attach(void)
 
 	len = sizeof(info);
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1);
 	if (CHECK_FAIL(err))
 		return;
 	err = bpf_obj_get_info_by_fd(fd1, &info, &len);
@@ -24,7 +24,7 @@ void serial_test_xdp_attach(void)
 		goto out_1;
 	id1 = info.id;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj2, &fd2);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj2, &fd2);
 	if (CHECK_FAIL(err))
 		goto out_1;
 
@@ -34,7 +34,7 @@ void serial_test_xdp_attach(void)
 		goto out_2;
 	id2 = info.id;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj3, &fd3);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj3, &fd3);
 	if (CHECK_FAIL(err))
 		goto out_2;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
index 4e2a4fd56f67..abe48e82e1dc 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
@@ -29,7 +29,7 @@ void serial_test_xdp_info(void)
 
 	/* Setup prog */
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
index 7185bee16fe4..15a3900e4370 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
@@ -9,7 +9,7 @@ void test_xdp_perf(void)
 	char in[128], out[128];
 	int err, prog_fd;
 
-	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
 	if (CHECK_FAIL(err))
 		return;
 
diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
index 49a84a3a2306..48cd14b43741 100644
--- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
@@ -73,7 +73,7 @@ int test_subprog2(struct args_subprog2 *ctx)
 			      __builtin_preserve_access_index(&skb->len));
 
 	ret = ctx->ret;
-	/* bpf_prog_load() loads "test_pkt_access.o" with BPF_F_TEST_RND_HI32
+	/* bpf_prog_test_load() loads "test_pkt_access.o" with BPF_F_TEST_RND_HI32
 	 * which randomizes upper 32 bits after BPF_ALU32 insns.
 	 * Hence after 'w0 <<= 1' upper bits of $rax are random.
 	 * That is expected and correct. Trim them.
diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c
index 804dddd97d4c..c299d3452695 100644
--- a/tools/testing/selftests/bpf/test_dev_cgroup.c
+++ b/tools/testing/selftests/bpf/test_dev_cgroup.c
@@ -14,6 +14,7 @@
 #include <bpf/libbpf.h>
 
 #include "cgroup_helpers.h"
+#include "testing_helpers.h"
 #include "bpf_rlimit.h"
 
 #define DEV_CGROUP_PROG "./dev_cgroup.o"
@@ -27,7 +28,7 @@ int main(int argc, char **argv)
 	int prog_fd, cgroup_fd;
 	__u32 prog_cnt;
 
-	if (bpf_prog_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE,
+	if (bpf_prog_test_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE,
 			  &obj, &prog_fd)) {
 		printf("Failed to load DEV_CGROUP program\n");
 		goto out;
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
index fb5fd6841ef3..ebf68dce5504 100644
--- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -45,6 +45,8 @@
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 
+#include "testing_helpers.h"
+
 int main(int argc, char **argv)
 {
 	struct bpf_object *obj;
@@ -58,8 +60,8 @@ int main(int argc, char **argv)
 		return 2;
 	}
 
-	ret = bpf_prog_load("test_lirc_mode2_kern.o",
-			    BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
+	ret = bpf_prog_test_load("test_lirc_mode2_kern.o",
+				 BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
 	if (ret) {
 		printf("Failed to load bpf program\n");
 		return 1;
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index c7a36a9378f8..8b31bc1a801d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -25,6 +25,7 @@
 #include "bpf_util.h"
 #include "bpf_rlimit.h"
 #include "test_maps.h"
+#include "testing_helpers.h"
 
 #ifndef ENOTSUPP
 #define ENOTSUPP 524
@@ -830,21 +831,21 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	/* Load SK_SKB program and Attach */
-	err = bpf_prog_load(SOCKMAP_PARSE_PROG,
+	err = bpf_prog_test_load(SOCKMAP_PARSE_PROG,
 			    BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog);
 	if (err) {
 		printf("Failed to load SK_SKB parse prog\n");
 		goto out_sockmap;
 	}
 
-	err = bpf_prog_load(SOCKMAP_TCP_MSG_PROG,
+	err = bpf_prog_test_load(SOCKMAP_TCP_MSG_PROG,
 			    BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog);
 	if (err) {
 		printf("Failed to load SK_SKB msg prog\n");
 		goto out_sockmap;
 	}
 
-	err = bpf_prog_load(SOCKMAP_VERDICT_PROG,
+	err = bpf_prog_test_load(SOCKMAP_VERDICT_PROG,
 			    BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog);
 	if (err) {
 		printf("Failed to load SK_SKB verdict prog\n");
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index 4a395d7a8ea9..4f6cf833b522 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -17,6 +17,7 @@
 #include "bpf_rlimit.h"
 #include "bpf_util.h"
 #include "cgroup_helpers.h"
+#include "testing_helpers.h"
 
 #define CG_PATH			"/foo"
 #define MAX_INSNS		512
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
index 4a39304cc5a6..63111cb082fe 100644
--- a/tools/testing/selftests/bpf/test_tcpnotify_user.c
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -25,6 +25,7 @@
 
 #include "test_tcpnotify.h"
 #include "trace_helpers.h"
+#include "testing_helpers.h"
 
 #define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ? getpagesize() : 8192L)
 
@@ -92,7 +93,7 @@ int main(int argc, char **argv)
 	if (cg_fd < 0)
 		goto err;
 
-	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+	if (bpf_prog_test_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
 		printf("FAILED: load_bpf_file failed for: %s\n", file);
 		goto err;
 	}
diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c
index 30f12637f4e4..baa870a759a2 100644
--- a/tools/testing/selftests/bpf/xdping.c
+++ b/tools/testing/selftests/bpf/xdping.c
@@ -22,6 +22,7 @@
 #include "bpf/libbpf.h"
 
 #include "xdping.h"
+#include "testing_helpers.h"
 
 static int ifindex;
 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
@@ -173,7 +174,7 @@ int main(int argc, char **argv)
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
-	if (bpf_prog_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
+	if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
 		fprintf(stderr, "load of %s failed\n", filename);
 		return 1;
 	}
-- 
cgit v1.2.3


From f19ddfe0360a1aa64db0b4a41f59e1ade3f6d288 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 3 Nov 2021 15:08:45 -0700
Subject: selftests/bpf: Use explicit bpf_test_load_program() helper calls

Remove the second part of prog loading testing helper re-definition:

  -Dbpf_load_program=bpf_test_load_program

This completes the clean up of deprecated libbpf program loading APIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211103220845.2676888-13-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile                               | 3 +--
 tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c  | 2 +-
 tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c       | 2 +-
 tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c    | 2 +-
 tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c | 2 +-
 tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c   | 4 ++--
 tools/testing/selftests/bpf/prog_tests/signal_pending.c            | 2 +-
 tools/testing/selftests/bpf/test_cgroup_storage.c                  | 3 ++-
 tools/testing/selftests/bpf/test_tag.c                             | 3 ++-
 9 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 2016c583ed20..e19cc6473936 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,8 +23,7 @@ BPF_GCC		?= $(shell command -v bpf-gcc;)
 SAN_CFLAGS	?=
 CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS)		\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
-	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)			\
-	  -Dbpf_load_program=bpf_test_load_program
+	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
 LDLIBS += -lcap -lelf -lz -lrt -lpthread
 
 # Silence some warnings when compiled with clang
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c
index 5de485c7370f..858916d11e2e 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c
@@ -16,7 +16,7 @@ static int prog_load(void)
 	};
 	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 
-	return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+	return bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB,
 			       prog, insns_cnt, "GPL", 0,
 			       bpf_log_buf, BPF_LOG_BUF_SIZE);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
index 731bea84d8ed..de9c3e12b0ea 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
@@ -66,7 +66,7 @@ static int prog_load_cnt(int verdict, int val)
 	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 	int ret;
 
-	ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+	ret = bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB,
 			       prog, insns_cnt, "GPL", 0,
 			       bpf_log_buf, BPF_LOG_BUF_SIZE);
 
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c
index 10d3c33821a7..356547e849e2 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c
@@ -18,7 +18,7 @@ static int prog_load(int verdict)
 	};
 	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 
-	return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+	return bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB,
 			       prog, insns_cnt, "GPL", 0,
 			       bpf_log_buf, BPF_LOG_BUF_SIZE);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
index 6093728497c7..93ac3f28226c 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
@@ -30,7 +30,7 @@ void serial_test_flow_dissector_load_bytes(void)
 
 	/* make sure bpf_skb_load_bytes is not allowed from skb-less context
 	 */
-	fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
+	fd = bpf_test_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
 			      ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 	CHECK(fd < 0,
 	      "flow_dissector-bpf_skb_load_bytes-load",
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
index f0c6c226aba8..7c79462d2702 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
@@ -47,9 +47,9 @@ static int load_prog(enum bpf_prog_type type)
 	};
 	int fd;
 
-	fd = bpf_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
+	fd = bpf_test_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 	if (CHECK_FAIL(fd < 0))
-		perror("bpf_load_program");
+		perror("bpf_test_load_program");
 
 	return fd;
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/signal_pending.c b/tools/testing/selftests/bpf/prog_tests/signal_pending.c
index fdfdcff6cbef..aecfe662c070 100644
--- a/tools/testing/selftests/bpf/prog_tests/signal_pending.c
+++ b/tools/testing/selftests/bpf/prog_tests/signal_pending.c
@@ -22,7 +22,7 @@ static void test_signal_pending_by_type(enum bpf_prog_type prog_type)
 		prog[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
 	prog[ARRAY_SIZE(prog) - 1] = BPF_EXIT_INSN();
 
-	prog_fd = bpf_load_program(prog_type, prog, ARRAY_SIZE(prog),
+	prog_fd = bpf_test_load_program(prog_type, prog, ARRAY_SIZE(prog),
 				   "GPL", 0, NULL, 0);
 	CHECK(prog_fd < 0, "test-run", "errno %d\n", errno);
 
diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c
index 0cda61da5d39..a63787e7bb1a 100644
--- a/tools/testing/selftests/bpf/test_cgroup_storage.c
+++ b/tools/testing/selftests/bpf/test_cgroup_storage.c
@@ -8,6 +8,7 @@
 
 #include "bpf_rlimit.h"
 #include "cgroup_helpers.h"
+#include "testing_helpers.h"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
 
@@ -66,7 +67,7 @@ int main(int argc, char **argv)
 
 	prog[0].imm = percpu_map_fd;
 	prog[7].imm = map_fd;
-	prog_fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+	prog_fd = bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB,
 				   prog, insns_cnt, "GPL", 0,
 				   bpf_log_buf, BPF_LOG_BUF_SIZE);
 	if (prog_fd < 0) {
diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c
index 6272c784ca2a..5c7bea525626 100644
--- a/tools/testing/selftests/bpf/test_tag.c
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -21,6 +21,7 @@
 
 #include "../../../include/linux/filter.h"
 #include "bpf_rlimit.h"
+#include "testing_helpers.h"
 
 static struct bpf_insn prog[BPF_MAXINSNS];
 
@@ -57,7 +58,7 @@ static int bpf_try_load_prog(int insns, int fd_map,
 	int fd_prog;
 
 	bpf_filler(insns, fd_map);
-	fd_prog = bpf_load_program(BPF_PROG_TYPE_SCHED_CLS, prog, insns, "", 0,
+	fd_prog = bpf_test_load_program(BPF_PROG_TYPE_SCHED_CLS, prog, insns, "", 0,
 				   NULL, 0);
 	assert(fd_prog > 0);
 	if (fd_map > 0)
-- 
cgit v1.2.3


From 2a2cb45b727b7a1041f3d3d93414b774e66454bb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:13 -0800
Subject: selftests/bpf: Pass sanitizer flags to linker through LDFLAGS

When adding -fsanitize=address to SAN_CFLAGS, it has to be passed both
to compiler through CFLAGS as well as linker through LDFLAGS. Add
SAN_CFLAGS into LDFLAGS to allow building selftests with ASAN.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-2-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e19cc6473936..0468ea57650d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,6 +24,7 @@ SAN_CFLAGS	?=
 CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS)		\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
 	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
+LDFLAGS += $(SAN_CFLAGS)
 LDLIBS += -lcap -lelf -lz -lrt -lpthread
 
 # Silence some warnings when compiled with clang
-- 
cgit v1.2.3


From 8f7b239ea8cfdc8e64c875ee417fed41431a1f37 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:14 -0800
Subject: libbpf: Free up resources used by inner map definition

It's not enough to just free(map->inner_map), as inner_map itself can
have extra memory allocated, like map name.

Fixes: 646f02ffdd49 ("libbpf: Add BTF-defined map-in-map support")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-3-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index dfd15cc60ea7..d869ebee1e27 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9009,7 +9009,10 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd)
 		pr_warn("error: inner_map_fd already specified\n");
 		return libbpf_err(-EINVAL);
 	}
-	zfree(&map->inner_map);
+	if (map->inner_map) {
+		bpf_map__destroy(map->inner_map);
+		zfree(&map->inner_map);
+	}
 	map->inner_map_fd = fd;
 	return 0;
 }
-- 
cgit v1.2.3


From 8ba285874913da21ca39a46376e9cc5ce0f45f94 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:15 -0800
Subject: selftests/bpf: Fix memory leaks in btf_type_c_dump() helper

Free up memory and resources used by temporary allocated memstream and
btf_dump instance.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-4-andrii@kernel.org
---
 tools/testing/selftests/bpf/btf_helpers.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c
index b5b6b013a245..3d1a748d09d8 100644
--- a/tools/testing/selftests/bpf/btf_helpers.c
+++ b/tools/testing/selftests/bpf/btf_helpers.c
@@ -251,18 +251,23 @@ const char *btf_type_c_dump(const struct btf *btf)
 	d = btf_dump__new(btf, NULL, &opts, btf_dump_printf);
 	if (libbpf_get_error(d)) {
 		fprintf(stderr, "Failed to create btf_dump instance: %ld\n", libbpf_get_error(d));
-		return NULL;
+		goto err_out;
 	}
 
 	for (i = 1; i < btf__type_cnt(btf); i++) {
 		err = btf_dump__dump_type(d, i);
 		if (err) {
 			fprintf(stderr, "Failed to dump type [%d]: %d\n", i, err);
-			return NULL;
+			goto err_out;
 		}
 	}
 
+	btf_dump__free(d);
 	fflush(buf_file);
 	fclose(buf_file);
 	return buf;
+err_out:
+	btf_dump__free(d);
+	fclose(buf_file);
+	return NULL;
 }
-- 
cgit v1.2.3


From b8b26e585f3a0fbcee1032c622f046787da57390 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:16 -0800
Subject: selftests/bpf: Free per-cpu values array in bpf_iter selftest

Array holding per-cpu values wasn't freed. Fix that.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211107165521.9240-5-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 9454331aaf85..3e10abce3e5a 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -699,14 +699,13 @@ static void test_bpf_percpu_hash_map(void)
 	char buf[64];
 	void *val;
 
-	val = malloc(8 * bpf_num_possible_cpus());
-
 	skel = bpf_iter_bpf_percpu_hash_map__open();
 	if (CHECK(!skel, "bpf_iter_bpf_percpu_hash_map__open",
 		  "skeleton open failed\n"))
 		return;
 
 	skel->rodata->num_cpus = bpf_num_possible_cpus();
+	val = malloc(8 * bpf_num_possible_cpus());
 
 	err = bpf_iter_bpf_percpu_hash_map__load(skel);
 	if (CHECK(!skel, "bpf_iter_bpf_percpu_hash_map__load",
@@ -770,6 +769,7 @@ free_link:
 	bpf_link__destroy(link);
 out:
 	bpf_iter_bpf_percpu_hash_map__destroy(skel);
+	free(val);
 }
 
 static void test_bpf_array_map(void)
@@ -870,14 +870,13 @@ static void test_bpf_percpu_array_map(void)
 	void *val;
 	int len;
 
-	val = malloc(8 * bpf_num_possible_cpus());
-
 	skel = bpf_iter_bpf_percpu_array_map__open();
 	if (CHECK(!skel, "bpf_iter_bpf_percpu_array_map__open",
 		  "skeleton open failed\n"))
 		return;
 
 	skel->rodata->num_cpus = bpf_num_possible_cpus();
+	val = malloc(8 * bpf_num_possible_cpus());
 
 	err = bpf_iter_bpf_percpu_array_map__load(skel);
 	if (CHECK(!skel, "bpf_iter_bpf_percpu_array_map__load",
@@ -933,6 +932,7 @@ free_link:
 	bpf_link__destroy(link);
 out:
 	bpf_iter_bpf_percpu_array_map__destroy(skel);
+	free(val);
 }
 
 /* An iterator program deletes all local storage in a map. */
-- 
cgit v1.2.3


From 5309b516bcc6f76dda0e44a7a1824324277093d6 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:17 -0800
Subject: selftests/bpf: Free inner strings index in btf selftest

Inner array of allocated strings wasn't freed on success. Now it's
always freed.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-6-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/btf.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index ac596cb06e40..ebd1aa4d09d6 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -4046,11 +4046,9 @@ static void *btf_raw_create(const struct btf_header *hdr,
 			next_str_idx < strs_cnt ? strs_idx[next_str_idx] : NULL;
 
 done:
+	free(strs_idx);
 	if (err) {
-		if (raw_btf)
-			free(raw_btf);
-		if (strs_idx)
-			free(strs_idx);
+		free(raw_btf);
 		return NULL;
 	}
 	return raw_btf;
-- 
cgit v1.2.3


From f79587520a6007a3734b23a3c2eb4c62aa457533 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:18 -0800
Subject: selftests/bpf: Clean up btf and btf_dump in dump_datasec test

Free up used resources at the end and on error. Also make it more
obvious that there is btf__parse() call that creates struct btf
instance.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-7-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/btf_dump.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index aa76360d8f49..a04961942dfa 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -814,21 +814,25 @@ static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str,
 
 static void test_btf_dump_datasec_data(char *str)
 {
-	struct btf *btf = btf__parse("xdping_kern.o", NULL);
+	struct btf *btf;
 	struct btf_dump_opts opts = { .ctx = str };
 	char license[4] = "GPL";
 	struct btf_dump *d;
 
+	btf = btf__parse("xdping_kern.o", NULL);
 	if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found"))
 		return;
 
 	d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf);
 	if (!ASSERT_OK_PTR(d, "could not create BTF dump"))
-		return;
+		goto out;
 
 	test_btf_datasec(btf, d, str, "license",
 			 "SEC(\"license\") char[4] _license = (char[4])['G','P','L',];",
 			 license, sizeof(license));
+out:
+	btf_dump__free(d);
+	btf__free(btf);
 }
 
 void test_btf_dump() {
-- 
cgit v1.2.3


From f92321d706a810b89a905e04658e38931c4bb0e0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:19 -0800
Subject: selftests/bpf: Avoid duplicate btf__parse() call

btf__parse() is repeated after successful setup, leaving the first
instance leaked. Remove redundant and premature call.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-8-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/core_reloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 55ec85ba7375..1041d0c593f6 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -433,7 +433,7 @@ static int setup_type_id_case_local(struct core_reloc_test_case *test)
 
 static int setup_type_id_case_success(struct core_reloc_test_case *test) {
 	struct core_reloc_type_id_output *exp = (void *)test->output;
-	struct btf *targ_btf = btf__parse(test->btf_src_file, NULL);
+	struct btf *targ_btf;
 	int err;
 
 	err = setup_type_id_case_local(test);
-- 
cgit v1.2.3


From f91231eeeed752119f49eb6620cae44ec745a007 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:20 -0800
Subject: selftests/bpf: Destroy XDP link correctly

bpf_link__detach() was confused with bpf_link__destroy() and leaves
leaked FD in the process. Fix the problem.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-9-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
index 7589c03fd26b..eb2feaac81fe 100644
--- a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -204,8 +204,8 @@ static int pass_ack(struct migrate_reuseport_test_case *test_case)
 {
 	int err;
 
-	err = bpf_link__detach(test_case->link);
-	if (!ASSERT_OK(err, "bpf_link__detach"))
+	err = bpf_link__destroy(test_case->link);
+	if (!ASSERT_OK(err, "bpf_link__destroy"))
 		return -1;
 
 	test_case->link = NULL;
-- 
cgit v1.2.3


From 8c7a95520184b6677ca6075e12df9c208d57d088 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 7 Nov 2021 08:55:21 -0800
Subject: selftests/bpf: Fix bpf_object leak in skb_ctx selftest

skb_ctx selftest didn't close bpf_object implicitly allocated by
bpf_prog_test_load() helper. Fix the problem by explicitly calling
bpf_object__close() at the end of the test.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211107165521.9240-10-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
index d3106078838c..b5319ba2ee27 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
@@ -111,4 +111,6 @@ void test_skb_ctx(void)
 		   "ctx_out_mark",
 		   "skb->mark == %u, expected %d\n",
 		   skb.mark, 10);
+
+	bpf_object__close(obj);
 }
-- 
cgit v1.2.3


From 7c7e3d31e7856a8260a254f8c71db416f7f9f5a1 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 5 Nov 2021 16:23:29 -0700
Subject: bpf: Introduce helper bpf_find_vma

In some profiler use cases, it is necessary to map an address to the
backing file, e.g., a shared library. bpf_find_vma helper provides a
flexible way to achieve this. bpf_find_vma maps an address of a task to
the vma (vm_area_struct) for this address, and feed the vma to an callback
BPF function. The callback function is necessary here, as we need to
ensure mmap_sem is unlocked.

It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem
safely when irqs are disable, we use the same mechanism as stackmap with
build_id. Specifically, when irqs are disabled, the unlocked is postponed
in an irq_work. Refactor stackmap.c so that the irq_work is shared among
bpf_find_vma and stackmap helpers.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Hengqi Chen <hengqi.chen@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211105232330.1936330-2-songliubraving@fb.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 20 +++++++++++
 kernel/bpf/btf.c               |  5 ++-
 kernel/bpf/mmap_unlock_work.h  | 65 ++++++++++++++++++++++++++++++++++
 kernel/bpf/stackmap.c          | 80 ++++--------------------------------------
 kernel/bpf/task_iter.c         | 76 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/verifier.c          | 34 ++++++++++++++++++
 kernel/trace/bpf_trace.c       |  2 ++
 tools/include/uapi/linux/bpf.h | 20 +++++++++++
 9 files changed, 222 insertions(+), 81 deletions(-)
 create mode 100644 kernel/bpf/mmap_unlock_work.h

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2be6dfd68df9..df3410bff4b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
+extern const struct bpf_func_proto bpf_find_vma_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba5af15e25f5..509eee5f0393 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4938,6 +4938,25 @@ union bpf_attr {
  *		**-ENOENT** if symbol is not found.
  *
  *		**-EPERM** if caller does not have permission to obtain kernel address.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5120,6 +5139,7 @@ union bpf_attr {
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbc3ad07e21b..cdb0fba65600 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6342,7 +6342,10 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL(btf_task_struct_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(struct, file)
+BTF_ID(struct, vm_area_struct)
 
 /* BTF ID set registration API for modules */
 
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+	struct irq_work irq_work;
+	struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+	struct mmap_unlock_irq_work *work = NULL;
+	bool irq_work_busy = false;
+
+	if (irqs_disabled()) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			work = this_cpu_ptr(&mmap_unlock_work);
+			if (irq_work_is_busy(&work->irq_work)) {
+				/* cannot queue more up_read, fallback */
+				irq_work_busy = true;
+			}
+		} else {
+			/*
+			 * PREEMPT_RT does not allow to trylock mmap sem in
+			 * interrupt disabled context. Force the fallback code.
+			 */
+			irq_work_busy = true;
+		}
+	}
+
+	*work_ptr = work;
+	return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+	if (!work) {
+		mmap_read_unlock(mm);
+	} else {
+		work->mm = mm;
+
+		/* The lock will be released once we're out of interrupt
+		 * context. Tell lockdep that we've released it now so
+		 * it doesn't complain that we forgot to release it.
+		 */
+		rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+		irq_work_queue(&work->irq_work);
+	}
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6e75bbee39f0..1de0a1b03636 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
 #include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
-#include <linux/irq_work.h>
 #include <linux/btf_ids.h>
 #include <linux/buildid.h>
 #include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
 
 #define STACK_CREATE_FLAG_MASK					\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
@@ -31,25 +31,6 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
-	struct irq_work irq_work;
-	struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
-	struct stack_map_irq_work *work;
-
-	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
-		return;
-
-	work = container_of(entry, struct stack_map_irq_work, irq_work);
-	mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
+	struct mmap_unlock_irq_work *work = NULL;
+	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
 	struct vm_area_struct *vma;
-	bool irq_work_busy = false;
-	struct stack_map_irq_work *work = NULL;
-
-	if (irqs_disabled()) {
-		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
-			work = this_cpu_ptr(&up_read_work);
-			if (irq_work_is_busy(&work->irq_work)) {
-				/* cannot queue more up_read, fallback */
-				irq_work_busy = true;
-			}
-		} else {
-			/*
-			 * PREEMPT_RT does not allow to trylock mmap sem in
-			 * interrupt disabled context. Force the fallback code.
-			 */
-			irq_work_busy = true;
-		}
-	}
 
-	/*
-	 * We cannot do up_read() when the irq is disabled, because of
-	 * risk to deadlock with rq_lock. To do build_id lookup when the
-	 * irqs are disabled, we need to run up_read() in irq_work. We use
-	 * a percpu variable to do the irq_work. If the irq_work is
-	 * already used by another lookup, we fall back to report ips.
-	 *
-	 * Same fallback is used for kernel stack (!user) on a stackmap
-	 * with build_id.
+	/* If the irq_work is in use, fall back to report ips. Same
+	 * fallback is used for kernel stack (!user) on a stackmap with
+	 * build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
 	    !mmap_read_trylock(current->mm)) {
@@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-
-	if (!work) {
-		mmap_read_unlock(current->mm);
-	} else {
-		work->mm = current->mm;
-
-		/* The lock will be released once we're out of interrupt
-		 * context. Tell lockdep that we've released it now so
-		 * it doesn't complain that we forgot to release it.
-		 */
-		rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
-		irq_work_queue(&work->irq_work);
-	}
+	bpf_mmap_unlock_mm(work, current->mm);
 }
 
 static struct perf_callchain_entry *
@@ -719,16 +666,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
 	.map_btf_name = "bpf_stack_map",
 	.map_btf_id = &stack_trace_map_btf_id,
 };
-
-static int __init stack_map_init(void)
-{
-	int cpu;
-	struct stack_map_irq_work *work;
-
-	for_each_possible_cpu(cpu) {
-		work = per_cpu_ptr(&up_read_work, cpu);
-		init_irq_work(&work->irq_work, do_up_read);
-	}
-	return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5a..f171479f7dd6 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
 #include <linux/fdtable.h>
 #include <linux/filter.h>
 #include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
@@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
 	.show	= task_vma_seq_show,
 };
 
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
 static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
 	.init_seq_private	= init_seq_pidns,
@@ -586,9 +583,74 @@ static struct bpf_iter_reg task_vma_reg_info = {
 	.seq_info		= &task_vma_seq_info,
 };
 
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+	struct mmap_unlock_irq_work *work = NULL;
+	struct vm_area_struct *vma;
+	bool irq_work_busy = false;
+	struct mm_struct *mm;
+	int ret = -ENOENT;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!task)
+		return -ENOENT;
+
+	mm = task->mm;
+	if (!mm)
+		return -ENOENT;
+
+	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+	if (irq_work_busy || !mmap_read_trylock(mm))
+		return -EBUSY;
+
+	vma = find_vma(mm, start);
+
+	if (vma && vma->vm_start <= start && vma->vm_end > start) {
+		callback_fn((u64)(long)task, (u64)(long)vma,
+			    (u64)(long)callback_ctx, 0, 0);
+		ret = 0;
+	}
+	bpf_mmap_unlock_mm(work, mm);
+	return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+	.func		= bpf_find_vma,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_task_struct_ids[0],
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_FUNC,
+	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+	struct mmap_unlock_irq_work *work;
+
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+		return;
+
+	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+	mmap_read_unlock_non_owner(work->mm);
+}
+
 static int __init task_iter_init(void)
 {
-	int ret;
+	struct mmap_unlock_irq_work *work;
+	int ret, cpu;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&mmap_unlock_work, cpu);
+		init_irq_work(&work->irq_work, do_mmap_read_unlock);
+	}
 
 	task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
 	ret = bpf_iter_reg_target(&task_reg_info);
@@ -596,13 +658,13 @@ static int __init task_iter_init(void)
 		return ret;
 
 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
-	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
+	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[1];
 	ret =  bpf_iter_reg_target(&task_file_reg_info);
 	if (ret)
 		return ret;
 
 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
-	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[2];
 	return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f0dca726ebfd..1aafb43f61d1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6132,6 +6132,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+				       struct bpf_func_state *caller,
+				       struct bpf_func_state *callee,
+				       int insn_idx)
+{
+	/* bpf_find_vma(struct task_struct *task, u64 addr,
+	 *               void *callback_fn, void *callback_ctx, u64 flags)
+	 * (callback_fn)(struct task_struct *task,
+	 *               struct vm_area_struct *vma, void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+	callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].btf =  btf_vmlinux;
+	callee->regs[BPF_REG_2].btf_id = btf_task_struct_ids[2];
+
+	/* pointer to stack or null */
+	callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -6489,6 +6516,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_find_vma) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_find_vma_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7396488793ff..390176a3031a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1208,6 +1208,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_func_ip_proto_tracing;
 	case BPF_FUNC_get_branch_snapshot:
 		return &bpf_get_branch_snapshot_proto;
+	case BPF_FUNC_find_vma:
+		return &bpf_find_vma_proto;
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ba5af15e25f5..509eee5f0393 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4938,6 +4938,25 @@ union bpf_attr {
  *		**-ENOENT** if symbol is not found.
  *
  *		**-EPERM** if caller does not have permission to obtain kernel address.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5120,6 +5139,7 @@ union bpf_attr {
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From f108662b27c96cdadfadd39f0c0d650704cd593d Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 5 Nov 2021 16:23:30 -0700
Subject: selftests/bpf: Add tests for bpf_find_vma

Add tests for bpf_find_vma in perf_event program and kprobe program. The
perf_event program is triggered from NMI context, so the second call of
bpf_find_vma() will return -EBUSY (irq_work busy). The kprobe program,
on the other hand, does not have this constraint.

Also add tests for illegal writes to task or vma from the callback
function. The verifier should reject both cases.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211105232330.1936330-3-songliubraving@fb.com
---
 tools/testing/selftests/bpf/prog_tests/find_vma.c  | 117 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/find_vma.c       |  69 ++++++++++++
 tools/testing/selftests/bpf/progs/find_vma_fail1.c |  29 +++++
 tools/testing/selftests/bpf/progs/find_vma_fail2.c |  29 +++++
 4 files changed, 244 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/find_vma.c
 create mode 100644 tools/testing/selftests/bpf/progs/find_vma.c
 create mode 100644 tools/testing/selftests/bpf/progs/find_vma_fail1.c
 create mode 100644 tools/testing/selftests/bpf/progs/find_vma_fail2.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c
new file mode 100644
index 000000000000..b74b3c0c555a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "find_vma.skel.h"
+#include "find_vma_fail1.skel.h"
+#include "find_vma_fail2.skel.h"
+
+static void test_and_reset_skel(struct find_vma *skel, int expected_find_zero_ret)
+{
+	ASSERT_EQ(skel->bss->found_vm_exec, 1, "found_vm_exec");
+	ASSERT_EQ(skel->data->find_addr_ret, 0, "find_addr_ret");
+	ASSERT_EQ(skel->data->find_zero_ret, expected_find_zero_ret, "find_zero_ret");
+	ASSERT_OK_PTR(strstr(skel->bss->d_iname, "test_progs"), "find_test_progs");
+
+	skel->bss->found_vm_exec = 0;
+	skel->data->find_addr_ret = -1;
+	skel->data->find_zero_ret = -1;
+	skel->bss->d_iname[0] = 0;
+}
+
+static int open_pe(void)
+{
+	struct perf_event_attr attr = {0};
+	int pfd;
+
+	/* create perf event */
+	attr.size = sizeof(attr);
+	attr.type = PERF_TYPE_HARDWARE;
+	attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	attr.freq = 1;
+	attr.sample_freq = 4000;
+	pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
+
+	return pfd >= 0 ? pfd : -errno;
+}
+
+static void test_find_vma_pe(struct find_vma *skel)
+{
+	struct bpf_link *link = NULL;
+	volatile int j = 0;
+	int pfd, i;
+
+	pfd = open_pe();
+	if (pfd < 0) {
+		if (pfd == -ENOENT || pfd == -EOPNOTSUPP) {
+			printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__);
+			test__skip();
+			goto cleanup;
+		}
+		if (!ASSERT_GE(pfd, 0, "perf_event_open"))
+			goto cleanup;
+	}
+
+	link = bpf_program__attach_perf_event(skel->progs.handle_pe, pfd);
+	if (!ASSERT_OK_PTR(link, "attach_perf_event"))
+		goto cleanup;
+
+	for (i = 0; i < 1000000; ++i)
+		++j;
+
+	test_and_reset_skel(skel, -EBUSY /* in nmi, irq_work is busy */);
+cleanup:
+	bpf_link__destroy(link);
+	close(pfd);
+}
+
+static void test_find_vma_kprobe(struct find_vma *skel)
+{
+	int err;
+
+	err = find_vma__attach(skel);
+	if (!ASSERT_OK(err, "get_branch_snapshot__attach"))
+		return;
+
+	getpgid(skel->bss->target_pid);
+	test_and_reset_skel(skel, -ENOENT /* could not find vma for ptr 0 */);
+}
+
+static void test_illegal_write_vma(void)
+{
+	struct find_vma_fail1 *skel;
+
+	skel = find_vma_fail1__open_and_load();
+	if (!ASSERT_ERR_PTR(skel, "find_vma_fail1__open_and_load"))
+		find_vma_fail1__destroy(skel);
+}
+
+static void test_illegal_write_task(void)
+{
+	struct find_vma_fail2 *skel;
+
+	skel = find_vma_fail2__open_and_load();
+	if (!ASSERT_ERR_PTR(skel, "find_vma_fail2__open_and_load"))
+		find_vma_fail2__destroy(skel);
+}
+
+void serial_test_find_vma(void)
+{
+	struct find_vma *skel;
+
+	skel = find_vma__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "find_vma__open_and_load"))
+		return;
+
+	skel->bss->target_pid = getpid();
+	skel->bss->addr = (__u64)(uintptr_t)test_find_vma_pe;
+
+	test_find_vma_pe(skel);
+	usleep(100000); /* allow the irq_work to finish */
+	test_find_vma_kprobe(skel);
+
+	find_vma__destroy(skel);
+	test_illegal_write_vma();
+	test_illegal_write_task();
+}
diff --git a/tools/testing/selftests/bpf/progs/find_vma.c b/tools/testing/selftests/bpf/progs/find_vma.c
new file mode 100644
index 000000000000..38034fb82530
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/find_vma.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct callback_ctx {
+	int dummy;
+};
+
+#define VM_EXEC		0x00000004
+#define DNAME_INLINE_LEN 32
+
+pid_t target_pid = 0;
+char d_iname[DNAME_INLINE_LEN] = {0};
+__u32 found_vm_exec = 0;
+__u64 addr = 0;
+int find_zero_ret = -1;
+int find_addr_ret = -1;
+
+static long check_vma(struct task_struct *task, struct vm_area_struct *vma,
+		      struct callback_ctx *data)
+{
+	if (vma->vm_file)
+		bpf_probe_read_kernel_str(d_iname, DNAME_INLINE_LEN - 1,
+					  vma->vm_file->f_path.dentry->d_iname);
+
+	/* check for VM_EXEC */
+	if (vma->vm_flags & VM_EXEC)
+		found_vm_exec = 1;
+
+	return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int handle_getpid(void)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct callback_ctx data = {};
+
+	if (task->pid != target_pid)
+		return 0;
+
+	find_addr_ret = bpf_find_vma(task, addr, check_vma, &data, 0);
+
+	/* this should return -ENOENT */
+	find_zero_ret = bpf_find_vma(task, 0, check_vma, &data, 0);
+	return 0;
+}
+
+SEC("perf_event")
+int handle_pe(void)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct callback_ctx data = {};
+
+	if (task->pid != target_pid)
+		return 0;
+
+	find_addr_ret = bpf_find_vma(task, addr, check_vma, &data, 0);
+
+	/* In NMI, this should return -EBUSY, as the previous call is using
+	 * the irq_work.
+	 */
+	find_zero_ret = bpf_find_vma(task, 0, check_vma, &data, 0);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/find_vma_fail1.c b/tools/testing/selftests/bpf/progs/find_vma_fail1.c
new file mode 100644
index 000000000000..b3b326b8e2d1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/find_vma_fail1.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct callback_ctx {
+	int dummy;
+};
+
+static long write_vma(struct task_struct *task, struct vm_area_struct *vma,
+		      struct callback_ctx *data)
+{
+	/* writing to vma, which is illegal */
+	vma->vm_flags |= 0x55;
+
+	return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int handle_getpid(void)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct callback_ctx data = {};
+
+	bpf_find_vma(task, 0, write_vma, &data, 0);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/find_vma_fail2.c b/tools/testing/selftests/bpf/progs/find_vma_fail2.c
new file mode 100644
index 000000000000..9bcf3203e26b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/find_vma_fail2.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct callback_ctx {
+	int dummy;
+};
+
+static long write_task(struct task_struct *task, struct vm_area_struct *vma,
+		       struct callback_ctx *data)
+{
+	/* writing to task, which is illegal */
+	task->mm = NULL;
+
+	return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int handle_getpid(void)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct callback_ctx data = {};
+
+	bpf_find_vma(task, 0, write_task, &data, 0);
+	return 0;
+}
-- 
cgit v1.2.3


From 33b8aad21ac175eba9577a73eb62b0aa141c241c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 18 Oct 2021 14:38:13 +0200
Subject: selftests: netfilter: add a vrf+conntrack testcase

Rework the reproducer for the vrf+conntrack regression reported
by Eugene into a selftest and also add a test for ip masquerading
that Lahav fixed recently.

With net or net-next tree, the first test fails and the latter
two pass.

With 09e856d54bda5f28 ("vrf: Reset skb conntrack connection on VRF rcv")
reverted first test passes but the last two fail.

A proper fix needs more work, for time being a revert seems to be
the best choice, snat/masquerade did not work before the fix.

Link: https://lore.kernel.org/netdev/378ca299-4474-7e9a-3d36-2350c8c98995@gmail.com/T/#m95358a31810df7392f541f99d187227bc75c9963
Reported-by: Eugene Crosser <crosser@average.org>
Cc: Lahav Schlesinger <lschlesinger@drivenets.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/Makefile         |   3 +-
 tools/testing/selftests/netfilter/conntrack_vrf.sh | 219 +++++++++++++++++++++
 2 files changed, 221 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/netfilter/conntrack_vrf.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
index 8748199ac109..ffca314897c4 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -5,7 +5,8 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
 	conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
 	nft_concat_range.sh nft_conntrack_helper.sh \
 	nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
-	ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
+	ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
+	conntrack_vrf.sh
 
 LDLIBS = -lmnl
 TEST_GEN_FILES =  nf-queue
diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/netfilter/conntrack_vrf.sh
new file mode 100755
index 000000000000..91f3ef0f1192
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_vrf.sh
@@ -0,0 +1,219 @@
+#!/bin/sh
+
+# This script demonstrates interaction of conntrack and vrf.
+# The vrf driver calls the netfilter hooks again, with oif/iif
+# pointing at the VRF device.
+#
+# For ingress, this means first iteration has iifname of lower/real
+# device.  In this script, thats veth0.
+# Second iteration is iifname set to vrf device, tvrf in this script.
+#
+# For egress, this is reversed: first iteration has the vrf device,
+# second iteration is done with the lower/real/veth0 device.
+#
+# test_ct_zone_in demonstrates unexpected change of nftables
+# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
+# connection on VRF rcv"
+#
+# It was possible to assign conntrack zone to a packet (or mark it for
+# `notracking`) in the prerouting chain before conntrack, based on real iif.
+#
+# After the change, the zone assignment is lost and the zone is assigned based
+# on the VRF master interface (in case such a rule exists).
+# assignment is lost. Instead, assignment based on the `iif` matching
+# Thus it is impossible to distinguish packets based on the original
+# interface.
+#
+# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem
+# that was supposed to be fixed by the commit mentioned above to make sure
+# that any fix to test case 1 won't break masquerade again.
+
+ksft_skip=4
+
+IP0=172.30.30.1
+IP1=172.30.30.2
+PFXL=30
+ret=0
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns0="ns0-$sfx"
+ns1="ns1-$sfx"
+
+cleanup()
+{
+	ip netns pids $ns0 | xargs kill 2>/dev/null
+	ip netns pids $ns1 | xargs kill 2>/dev/null
+
+	ip netns del $ns0 $ns1
+}
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+ip netns add "$ns0"
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not create net namespace $ns0"
+	exit $ksft_skip
+fi
+ip netns add "$ns1"
+
+trap cleanup EXIT
+
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+
+ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not add veth device"
+	exit $ksft_skip
+fi
+
+ip -net $ns0 li add tvrf type vrf table 9876
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not add vrf device"
+	exit $ksft_skip
+fi
+
+ip -net $ns0 li set lo up
+
+ip -net $ns0 li set veth0 master tvrf
+ip -net $ns0 li set tvrf up
+ip -net $ns0 li set veth0 up
+ip -net $ns1 li set veth0 up
+
+ip -net $ns0 addr add $IP0/$PFXL dev veth0
+ip -net $ns1 addr add $IP1/$PFXL dev veth0
+
+ip netns exec $ns1 iperf3 -s > /dev/null 2>&1&
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not start iperf3"
+	exit $ksft_skip
+fi
+
+# test vrf ingress handling.
+# The incoming connection should be placed in conntrack zone 1,
+# as decided by the first iteration of the ruleset.
+test_ct_zone_in()
+{
+ip netns exec $ns0 nft -f - <<EOF
+table testct {
+	chain rawpre {
+		type filter hook prerouting priority raw;
+
+		iif { veth0, tvrf } counter meta nftrace set 1
+		iif veth0 counter ct zone set 1 counter return
+		iif tvrf counter ct zone set 2 counter return
+		ip protocol icmp counter
+		notrack counter
+	}
+
+	chain rawout {
+		type filter hook output priority raw;
+
+		oif veth0 counter ct zone set 1 counter return
+		oif tvrf counter ct zone set 2 counter return
+		notrack counter
+	}
+}
+EOF
+	ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null
+
+	# should be in zone 1, not zone 2
+	count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
+	if [ $count -eq 1 ]; then
+		echo "PASS: entry found in conntrack zone 1"
+	else
+		echo "FAIL: entry not found in conntrack zone 1"
+		count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
+		if [ $count -eq 1 ]; then
+			echo "FAIL: entry found in zone 2 instead"
+		else
+			echo "FAIL: entry not in zone 1 or 2, dumping table"
+			ip netns exec $ns0 conntrack -L
+			ip netns exec $ns0 nft list ruleset
+		fi
+	fi
+}
+
+# add masq rule that gets evaluated w. outif set to vrf device.
+# This tests the first iteration of the packet through conntrack,
+# oifname is the vrf device.
+test_masquerade_vrf()
+{
+	ip netns exec $ns0 conntrack -F 2>/dev/null
+
+ip netns exec $ns0 nft -f - <<EOF
+flush ruleset
+table ip nat {
+	chain postrouting {
+		type nat hook postrouting priority 0;
+		# NB: masquerade should always be combined with 'oif(name) bla',
+		# lack of this is intentional here, we want to exercise double-snat.
+		ip saddr 172.30.30.0/30 counter masquerade random
+	}
+}
+EOF
+	ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null
+	if [ $? -ne 0 ]; then
+		echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device"
+		ret=1
+		return
+	fi
+
+	# must also check that nat table was evaluated on second (lower device) iteration.
+	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
+	if [ $? -eq 0 ]; then
+		echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device"
+	else
+		echo "FAIL: vrf masq rule has unexpected counter value"
+		ret=1
+	fi
+}
+
+# add masq rule that gets evaluated w. outif set to veth device.
+# This tests the 2nd iteration of the packet through conntrack,
+# oifname is the lower device (veth0 in this case).
+test_masquerade_veth()
+{
+	ip netns exec $ns0 conntrack -F 2>/dev/null
+ip netns exec $ns0 nft -f - <<EOF
+flush ruleset
+table ip nat {
+	chain postrouting {
+		type nat hook postrouting priority 0;
+		meta oif veth0 ip saddr 172.30.30.0/30 counter masquerade random
+	}
+}
+EOF
+	ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null
+	if [ $? -ne 0 ]; then
+		echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device"
+		ret=1
+		return
+	fi
+
+	# must also check that nat table was evaluated on second (lower device) iteration.
+	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
+	if [ $? -eq 0 ]; then
+		echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device"
+	else
+		echo "FAIL: vrf masq rule has unexpected counter value"
+		ret=1
+	fi
+}
+
+test_ct_zone_in
+test_masquerade_vrf
+test_masquerade_veth
+
+exit $ret
-- 
cgit v1.2.3


From 228c3fa054ad118f78fc60f1a3f05ba29b043608 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Oct 2021 18:25:37 +0200
Subject: selftests: netfilter: extend nfqueue tests to cover vrf device

VRF device calls the output/postrouting hooks so packet should be seeon
with oifname tvrf and once with eth0.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_queue.sh | 54 ++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh
index 3d202b90b33d..7d27f1f3bc01 100755
--- a/tools/testing/selftests/netfilter/nft_queue.sh
+++ b/tools/testing/selftests/netfilter/nft_queue.sh
@@ -16,6 +16,10 @@ timeout=4
 
 cleanup()
 {
+	ip netns pids ${ns1} | xargs kill 2>/dev/null
+	ip netns pids ${ns2} | xargs kill 2>/dev/null
+	ip netns pids ${nsrouter} | xargs kill 2>/dev/null
+
 	ip netns del ${ns1}
 	ip netns del ${ns2}
 	ip netns del ${nsrouter}
@@ -332,6 +336,55 @@ EOF
 	echo "PASS: tcp via loopback and re-queueing"
 }
 
+test_icmp_vrf() {
+	ip -net $ns1 link add tvrf type vrf table 9876
+	if [ $? -ne 0 ];then
+		echo "SKIP: Could not add vrf device"
+		return
+	fi
+
+	ip -net $ns1 li set eth0 master tvrf
+	ip -net $ns1 li set tvrf up
+
+	ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876
+ip netns exec ${ns1} nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+}
+EOF
+	ip netns exec ${ns1} ./nf-queue -q 1 -t $timeout &
+	local nfqpid=$!
+
+	sleep 1
+	ip netns exec ${ns1} ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null
+
+	for n in output post; do
+		for d in tvrf eth0; do
+			ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"
+			if [ $? -ne 0 ] ; then
+				echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2
+				ip netns exec ${ns1} nft list ruleset
+				ret=1
+				return
+			fi
+		done
+	done
+
+	wait $nfqpid
+	[ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf"
+	wait 2>/dev/null
+}
+
 ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
 ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
 ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
@@ -372,5 +425,6 @@ test_queue 20
 test_tcp_forward
 test_tcp_localhost
 test_tcp_localhost_requeue
+test_icmp_vrf
 
 exit $ret
-- 
cgit v1.2.3


From e1f8bc06e49794ebecc0f276429a7ab9f105ac06 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 3 Nov 2021 19:53:42 +0100
Subject: selftests: nft_nat: Improve port shadow test stability

Setup phase in test_port_shadow() relied upon a race-condition:
Listening nc on port 1405 was started in background before attempting to
create the fake conntrack entry using the same source port. If listening
nc won, fake conntrack entry could not be created causing wrong
behaviour. Reorder nc calls to fix this and introduce a short delay
before testing the setup to wait for listening nc process startup.

Fixes: 465f15a6d1a8f ("selftests: nft_nat: add udp hole punch test case")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_nat.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
index da1c1e4b6c86..905c033db74d 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -759,14 +759,16 @@ test_port_shadow()
 	local result=""
 	local logmsg=""
 
+	# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
+	echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null
+
 	echo ROUTER | ip netns exec "$ns0" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
 	nc_r=$!
 
 	echo CLIENT | ip netns exec "$ns2" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
 	nc_c=$!
 
-	# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
-	echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null
+	sleep 0.3
 
 	# ns1 tries to connect to ns0:1405.  With default settings this should connect
 	# to client, it matches the conntrack entry created above.
-- 
cgit v1.2.3


From 85c0c8b342e89761554eed3f572ee55c0e5c6536 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 3 Nov 2021 19:53:43 +0100
Subject: selftests: nft_nat: Simplify port shadow notrack test

The second rule in prerouting chain was probably a leftover: The router
listens on veth0, so not tracking connections via that interface is
sufficient. Likewise, the rule in output chain can be limited to that
interface as well.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_nat.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
index 905c033db74d..c62e4e26252c 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -818,11 +818,10 @@ table $family raw {
 	chain prerouting {
 		type filter hook prerouting priority -300; policy accept;
 		meta iif veth0 udp dport 1405 notrack
-		udp dport 1405 notrack
 	}
 	chain output {
 		type filter hook output priority -300; policy accept;
-		udp sport 1405 notrack
+		meta oif veth0 udp sport 1405 notrack
 	}
 }
 EOF
-- 
cgit v1.2.3


From c23551c9c36ae394f9c53a5adf1944a943c65e0b Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Fri, 5 Nov 2021 16:50:46 +0000
Subject: selftests/bpf: Add exception handling selftests for tp_bpf program

Exception handling is triggered in BPF tracing programs when a NULL pointer
is dereferenced; the exception handler zeroes the target register and
execution of the BPF program progresses.

To test exception handling then, we need to trigger a NULL pointer dereference
for a field which should never be zero; if it is, the only explanation is the
exception handler ran. task->task_works is the NULL pointer chosen (for a new
task from fork() no work is associated), and the task_works->func field should
not be zero if task_works is non-NULL. The test verifies that task_works and
task_works->func are 0.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1636131046-5982-3-git-send-email-alan.maguire@oracle.com
---
 tools/testing/selftests/bpf/prog_tests/exhandler.c | 43 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/exhandler_kern.c | 43 ++++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/exhandler.c
 create mode 100644 tools/testing/selftests/bpf/progs/exhandler_kern.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/exhandler.c b/tools/testing/selftests/bpf/prog_tests/exhandler.c
new file mode 100644
index 000000000000..118bb182ee20
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/exhandler.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021, Oracle and/or its affiliates. */
+
+#include <test_progs.h>
+
+/* Test that verifies exception handling is working. fork()
+ * triggers task_newtask tracepoint; that new task will have a
+ * NULL pointer task_works, and the associated task->task_works->func
+ * should not be NULL if task_works itself is non-NULL.
+ *
+ * So to verify exception handling we want to see a NULL task_works
+ * and task_works->func; if we see this we can conclude that the
+ * exception handler ran when we attempted to dereference task->task_works
+ * and zeroed the destination register.
+ */
+#include "exhandler_kern.skel.h"
+
+void test_exhandler(void)
+{
+	int err = 0, duration = 0, status;
+	struct exhandler_kern *skel;
+	pid_t cpid;
+
+	skel = exhandler_kern__open_and_load();
+	if (CHECK(!skel, "skel_load", "skeleton failed: %d\n", err))
+		goto cleanup;
+
+	skel->bss->test_pid = getpid();
+
+	err = exhandler_kern__attach(skel);
+	if (!ASSERT_OK(err, "attach"))
+		goto cleanup;
+	cpid = fork();
+	if (!ASSERT_GT(cpid, -1, "fork failed"))
+		goto cleanup;
+	if (cpid == 0)
+		_exit(0);
+	waitpid(cpid, &status, 0);
+
+	ASSERT_NEQ(skel->bss->exception_triggered, 0, "verify exceptions occurred");
+cleanup:
+	exhandler_kern__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/exhandler_kern.c b/tools/testing/selftests/bpf/progs/exhandler_kern.c
new file mode 100644
index 000000000000..f5ca142abf8f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/exhandler_kern.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021, Oracle and/or its affiliates. */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+unsigned int exception_triggered;
+int test_pid;
+
+/* TRACE_EVENT(task_newtask,
+ *         TP_PROTO(struct task_struct *p, u64 clone_flags)
+ */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(trace_task_newtask, struct task_struct *task, u64 clone_flags)
+{
+	int pid = bpf_get_current_pid_tgid() >> 32;
+	struct callback_head *work;
+	void *func;
+
+	if (test_pid != pid)
+		return 0;
+
+	/* To verify we hit an exception we dereference task->task_works->func.
+	 * If task work has been added,
+	 * - task->task_works is non-NULL; and
+	 * - task->task_works->func is non-NULL also (the callback function
+	 *   must be specified for the task work.
+	 *
+	 * However, for a newly-created task, task->task_works is NULLed,
+	 * so we know the exception handler triggered if task_works is
+	 * NULL and func is NULL.
+	 */
+	work = task->task_works;
+	func = work->func;
+	if (!work && !func)
+		exception_triggered++;
+	return 0;
+}
-- 
cgit v1.2.3


From 3a74ac2d1159716f35c944639f71b33fa16084c8 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 6 Nov 2021 05:12:40 +0530
Subject: libbpf: Compile using -std=gnu89

The minimum supported C standard version is C89, with use of GNU
extensions, hence make sure to catch any instances that would break
the build for this mode by passing -std=gnu89.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211105234243.390179-4-memxor@gmail.com
---
 tools/lib/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index b393b5e82380..5f7086fae31c 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -84,6 +84,7 @@ else
 endif
 
 # Append required CFLAGS
+override CFLAGS += -std=gnu89
 override CFLAGS += $(EXTRA_WARNINGS) -Wno-switch-enum
 override CFLAGS += -Werror -Wall
 override CFLAGS += $(INCLUDES)
-- 
cgit v1.2.3


From 1a8b597ddabe7dc25aa9defd33949d455ee9cde8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Fri, 5 Nov 2021 22:19:04 +0000
Subject: bpftool: Fix SPDX tag for Makefiles and .gitignore

Bpftool is dual-licensed under GPLv2 and BSD-2-Clause. In commit
907b22365115 ("tools: bpftool: dual license all files") we made sure
that all its source files were indeed covered by the two licenses, and
that they had the correct SPDX tags.

However, bpftool's Makefile, the Makefile for its documentation, and the
.gitignore file were skipped at the time (their GPL-2.0-only tag was
added later). Let's update the tags.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Tobias Klauser <tklauser@distanz.ch>
Acked-by: Joe Stringer <joe@cilium.io>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20211105221904.3536-1-quentin@isovalent.com
---
 tools/bpf/bpftool/.gitignore             | 2 +-
 tools/bpf/bpftool/Documentation/Makefile | 2 +-
 tools/bpf/bpftool/Makefile               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore
index 05ce4446b780..a736f64dc5dc 100644
--- a/tools/bpf/bpftool/.gitignore
+++ b/tools/bpf/bpftool/.gitignore
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 *.d
 /bootstrap/
 /bpftool
diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index c49487905ceb..44b60784847b 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 include ../../../scripts/Makefile.include
 include ../../../scripts/utilities.mak
 
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index c0c30e56988f..622568c7a9b8 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 include ../../scripts/Makefile.include
 include ../../scripts/utilities.mak
 
-- 
cgit v1.2.3


From f89315650ba34ec6c91a8bded72796980bee2a4d Mon Sep 17 00:00:00 2001
From: Mark Pashmfouroush <markpash@cloudflare.com>
Date: Wed, 10 Nov 2021 11:10:15 +0000
Subject: bpf: Add ingress_ifindex to bpf_sk_lookup

It may be helpful to have access to the ifindex during bpf socket
lookup. An example may be to scope certain socket lookup logic to
specific interfaces, i.e. an interface may be made exempt from custom
lookup code.

Add the ifindex of the arriving connection to the bpf_sk_lookup API.

Signed-off-by: Mark Pashmfouroush <markpash@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211110111016.5670-2-markpash@cloudflare.com
---
 include/linux/filter.h         | 7 +++++--
 include/uapi/linux/bpf.h       | 1 +
 net/core/filter.c              | 7 +++++++
 net/ipv4/inet_hashtables.c     | 8 ++++----
 net/ipv4/udp.c                 | 8 ++++----
 net/ipv6/inet6_hashtables.c    | 8 ++++----
 net/ipv6/udp.c                 | 8 ++++----
 tools/include/uapi/linux/bpf.h | 1 +
 8 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 24b7ed2677af..b6a216eb217a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1374,6 +1374,7 @@ struct bpf_sk_lookup_kern {
 		const struct in6_addr *daddr;
 	} v6;
 	struct sock	*selected_sk;
+	u32		ingress_ifindex;
 	bool		no_reuseport;
 };
 
@@ -1436,7 +1437,7 @@ extern struct static_key_false bpf_sk_lookup_enabled;
 static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
 					const __be32 saddr, const __be16 sport,
 					const __be32 daddr, const u16 dport,
-					struct sock **psk)
+					const int ifindex, struct sock **psk)
 {
 	struct bpf_prog_array *run_array;
 	struct sock *selected_sk = NULL;
@@ -1452,6 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
 			.v4.daddr	= daddr,
 			.sport		= sport,
 			.dport		= dport,
+			.ingress_ifindex	= ifindex,
 		};
 		u32 act;
 
@@ -1474,7 +1476,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 					const __be16 sport,
 					const struct in6_addr *daddr,
 					const u16 dport,
-					struct sock **psk)
+					const int ifindex, struct sock **psk)
 {
 	struct bpf_prog_array *run_array;
 	struct sock *selected_sk = NULL;
@@ -1490,6 +1492,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 			.v6.daddr	= daddr,
 			.sport		= sport,
 			.dport		= dport,
+			.ingress_ifindex	= ifindex,
 		};
 		u32 act;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 509eee5f0393..6297eafdc40f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6316,6 +6316,7 @@ struct bpf_sk_lookup {
 	__u32 local_ip4;	/* Network byte order */
 	__u32 local_ip6[4];	/* Network byte order */
 	__u32 local_port;	/* Host byte order */
+	__u32 ingress_ifindex;		/* The arriving interface. Determined by inet_iif. */
 };
 
 /*
diff --git a/net/core/filter.c b/net/core/filter.c
index 8e8d3b49c297..315a58466fc9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10491,6 +10491,7 @@ static bool sk_lookup_is_valid_access(int off, int size,
 	case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
 	case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
 	case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+	case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
 		bpf_ctx_record_field_size(info, sizeof(__u32));
 		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
 
@@ -10580,6 +10581,12 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
 				      bpf_target_off(struct bpf_sk_lookup_kern,
 						     dport, 2, target_size));
 		break;
+
+	case offsetof(struct bpf_sk_lookup, ingress_ifindex):
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     ingress_ifindex, 4, target_size));
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 75737267746f..30ab717ff1b8 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -307,7 +307,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
 					       struct inet_hashinfo *hashinfo,
 					       struct sk_buff *skb, int doff,
 					       __be32 saddr, __be16 sport,
-					       __be32 daddr, u16 hnum)
+					       __be32 daddr, u16 hnum, const int dif)
 {
 	struct sock *sk, *reuse_sk;
 	bool no_reuseport;
@@ -315,8 +315,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
 	if (hashinfo != &tcp_hashinfo)
 		return NULL; /* only TCP is supported */
 
-	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
-					    saddr, sport, daddr, hnum, &sk);
+	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
+					    daddr, hnum, dif, &sk);
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
@@ -340,7 +340,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 	/* Lookup redirect from BPF */
 	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
 		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
-					     saddr, sport, daddr, hnum);
+					     saddr, sport, daddr, hnum, dif);
 		if (result)
 			goto done;
 	}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2fffcf2b54f3..5fceee3de65d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -460,7 +460,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
 					struct udp_table *udptable,
 					struct sk_buff *skb,
 					__be32 saddr, __be16 sport,
-					__be32 daddr, u16 hnum)
+					__be32 daddr, u16 hnum, const int dif)
 {
 	struct sock *sk, *reuse_sk;
 	bool no_reuseport;
@@ -468,8 +468,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
 	if (udptable != &udp_table)
 		return NULL; /* only UDP is supported */
 
-	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
-					    saddr, sport, daddr, hnum, &sk);
+	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
+					    daddr, hnum, dif, &sk);
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
@@ -505,7 +505,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	/* Lookup redirect from BPF */
 	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
 		sk = udp4_lookup_run_bpf(net, udptable, skb,
-					 saddr, sport, daddr, hnum);
+					 saddr, sport, daddr, hnum, dif);
 		if (sk) {
 			result = sk;
 			goto done;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 67c9114835c8..4514444e96c8 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -165,7 +165,7 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net,
 						const struct in6_addr *saddr,
 						const __be16 sport,
 						const struct in6_addr *daddr,
-						const u16 hnum)
+						const u16 hnum, const int dif)
 {
 	struct sock *sk, *reuse_sk;
 	bool no_reuseport;
@@ -173,8 +173,8 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net,
 	if (hashinfo != &tcp_hashinfo)
 		return NULL; /* only TCP is supported */
 
-	no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP,
-					    saddr, sport, daddr, hnum, &sk);
+	no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport,
+					    daddr, hnum, dif, &sk);
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
@@ -198,7 +198,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 	/* Lookup redirect from BPF */
 	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
 		result = inet6_lookup_run_bpf(net, hashinfo, skb, doff,
-					      saddr, sport, daddr, hnum);
+					      saddr, sport, daddr, hnum, dif);
 		if (result)
 			goto done;
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 12c12619ee35..ea4ea525f94a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -195,7 +195,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net,
 					       const struct in6_addr *saddr,
 					       __be16 sport,
 					       const struct in6_addr *daddr,
-					       u16 hnum)
+					       u16 hnum, const int dif)
 {
 	struct sock *sk, *reuse_sk;
 	bool no_reuseport;
@@ -203,8 +203,8 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net,
 	if (udptable != &udp_table)
 		return NULL; /* only UDP is supported */
 
-	no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP,
-					    saddr, sport, daddr, hnum, &sk);
+	no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport,
+					    daddr, hnum, dif, &sk);
 	if (no_reuseport || IS_ERR_OR_NULL(sk))
 		return sk;
 
@@ -240,7 +240,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	/* Lookup redirect from BPF */
 	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
 		sk = udp6_lookup_run_bpf(net, udptable, skb,
-					 saddr, sport, daddr, hnum);
+					 saddr, sport, daddr, hnum, dif);
 		if (sk) {
 			result = sk;
 			goto done;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 509eee5f0393..6297eafdc40f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6316,6 +6316,7 @@ struct bpf_sk_lookup {
 	__u32 local_ip4;	/* Network byte order */
 	__u32 local_ip6[4];	/* Network byte order */
 	__u32 local_port;	/* Host byte order */
+	__u32 ingress_ifindex;		/* The arriving interface. Determined by inet_iif. */
 };
 
 /*
-- 
cgit v1.2.3


From 8b4fd2bf1f47c3e3a63c327fca2ad5c4e2691ef8 Mon Sep 17 00:00:00 2001
From: Mark Pashmfouroush <markpash@cloudflare.com>
Date: Wed, 10 Nov 2021 11:10:16 +0000
Subject: selftests/bpf: Add tests for accessing ingress_ifindex in
 bpf_sk_lookup

A new field was added to the bpf_sk_lookup data that users can access.
Add tests that validate that the new ingress_ifindex field contains the
right data.

Signed-off-by: Mark Pashmfouroush <markpash@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211110111016.5670-3-markpash@cloudflare.com
---
 tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 31 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_sk_lookup.c |  8 ++++++
 .../testing/selftests/bpf/verifier/ctx_sk_lookup.c | 32 ++++++++++++++++++++++
 3 files changed, 71 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
index 6db07401bc49..57846cc7ce36 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
@@ -937,6 +937,37 @@ static void test_drop_on_lookup(struct test_sk_lookup *skel)
 			.connect_to	= { EXT_IP6, EXT_PORT },
 			.listen_at	= { EXT_IP6, INT_PORT },
 		},
+		/* The program will drop on success, meaning that the ifindex
+		 * was 1.
+		 */
+		{
+			.desc		= "TCP IPv4 drop on valid ifindex",
+			.lookup_prog	= skel->progs.check_ifindex,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { EXT_IP4, EXT_PORT },
+		},
+		{
+			.desc		= "TCP IPv6 drop on valid ifindex",
+			.lookup_prog	= skel->progs.check_ifindex,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			.listen_at	= { EXT_IP6, EXT_PORT },
+		},
+		{
+			.desc		= "UDP IPv4 drop on valid ifindex",
+			.lookup_prog	= skel->progs.check_ifindex,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { EXT_IP4, EXT_PORT },
+		},
+		{
+			.desc		= "UDP IPv6 drop on valid ifindex",
+			.lookup_prog	= skel->progs.check_ifindex,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			.listen_at	= { EXT_IP6, EXT_PORT },
+		},
 	};
 	const struct test *t;
 
diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
index 19d2465d9442..83b0aaa52ef7 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
@@ -84,6 +84,14 @@ int lookup_drop(struct bpf_sk_lookup *ctx)
 	return SK_DROP;
 }
 
+SEC("sk_lookup")
+int check_ifindex(struct bpf_sk_lookup *ctx)
+{
+	if (ctx->ingress_ifindex == 1)
+		return SK_DROP;
+	return SK_PASS;
+}
+
 SEC("sk_reuseport")
 int reuseport_pass(struct sk_reuseport_md *ctx)
 {
diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
index d78627be060f..a2b006e2fd06 100644
--- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
+++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
@@ -229,6 +229,24 @@
 		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
 			    offsetof(struct bpf_sk_lookup, local_port)),
 
+		/* 1-byte read from ingress_ifindex field */
+		BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex)),
+		BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex) + 1),
+		BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex) + 2),
+		BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex) + 3),
+		/* 2-byte read from ingress_ifindex field */
+		BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex)),
+		BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex) + 2),
+		/* 4-byte read from ingress_ifindex field */
+		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex)),
+
 		/* 8-byte read from sk field */
 		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
 			    offsetof(struct bpf_sk_lookup, sk)),
@@ -351,6 +369,20 @@
 	.expected_attach_type = BPF_SK_LOOKUP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"invalid 8-byte read from bpf_sk_lookup ingress_ifindex field",
+	.insns = {
+		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+			    offsetof(struct bpf_sk_lookup, ingress_ifindex)),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.errstr = "invalid bpf_context access",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+	.expected_attach_type = BPF_SK_LOOKUP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 /* invalid 1,2,4-byte reads from 8-byte fields in bpf_sk_lookup */
 {
 	"invalid 4-byte read from bpf_sk_lookup sk field",
-- 
cgit v1.2.3


From a6ca71583137300f207343d5d950cb1c365ab911 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:17:57 -0800
Subject: libbpf: Add ability to get/set per-program load flags

Add bpf_program__flags() API to retrieve prog_flags that will be (or
were) supplied to BPF_PROG_LOAD command.

Also add bpf_program__set_extra_flags() API to allow to set *extra*
flags, in addition to those determined by program's SEC() definition.
Such flags are logically OR'ed with libbpf-derived flags.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111051758.92283-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c   | 14 ++++++++++++++
 tools/lib/bpf/libbpf.h   |  3 +++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 19 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d869ebee1e27..a823b5ed705b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8262,6 +8262,20 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 	prog->expected_attach_type = type;
 }
 
+__u32 bpf_program__flags(const struct bpf_program *prog)
+{
+	return prog->prog_flags;
+}
+
+int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags)
+{
+	if (prog->obj->loaded)
+		return libbpf_err(-EBUSY);
+
+	prog->prog_flags |= extra_flags;
+	return 0;
+}
+
 #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) {			    \
 	.sec = sec_pfx,							    \
 	.prog_type = BPF_PROG_TYPE_##ptype,				    \
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 039058763173..f69512ae8505 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -493,6 +493,9 @@ LIBBPF_API void
 bpf_program__set_expected_attach_type(struct bpf_program *prog,
 				      enum bpf_attach_type type);
 
+LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags);
+
 LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
 			       const char *attach_func_name);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b895861a13c0..256d5d4be951 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -397,8 +397,10 @@ LIBBPF_0.6.0 {
 		bpf_object__prev_program;
 		bpf_prog_load_deprecated;
 		bpf_prog_load;
+		bpf_program__flags;
 		bpf_program__insn_cnt;
 		bpf_program__insns;
+		bpf_program__set_extra_flags;
 		btf__add_btf;
 		btf__add_decl_tag;
 		btf__raw_data;
-- 
cgit v1.2.3


From 50dee7078b66d881c62f6177844d625f7ead6003 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:17:58 -0800
Subject: selftests/bpf: Fix bpf_prog_test_load() logic to pass extra log level

After recent refactoring bpf_prog_test_load(), used across multiple
selftests, lost ability to specify extra log_level 1 or 2 (for -vv and
-vvv, respectively). Fix that problem by using bpf_object__load_xattr()
API that supports extra log_level flags. Also restore
BPF_F_TEST_RND_HI32 prog_flags by utilizing new bpf_program__set_extra_flags()
API.

Fixes: f87c1930ac29 ("selftests/bpf: Merge test_stub.c into testing_helpers.c")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111051758.92283-3-andrii@kernel.org
---
 tools/testing/selftests/bpf/testing_helpers.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index ef61d43adfe4..52c2f24e0898 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -88,6 +88,7 @@ int extra_prog_load_log_flags = 0;
 int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 		       struct bpf_object **pobj, int *prog_fd)
 {
+	struct bpf_object_load_attr attr = {};
 	struct bpf_object *obj;
 	struct bpf_program *prog;
 	int err;
@@ -105,7 +106,11 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	if (type != BPF_PROG_TYPE_UNSPEC)
 		bpf_program__set_type(prog, type);
 
-	err = bpf_object__load(obj);
+	bpf_program__set_extra_flags(prog, BPF_F_TEST_RND_HI32);
+
+	attr.obj = obj;
+	attr.log_level = extra_prog_load_log_flags;
+	err = bpf_object__load_xattr(&attr);
 	if (err)
 		goto err_out;
 
-- 
cgit v1.2.3


From 6501182c08f76e4118177b42614174547e1bd149 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:16 -0800
Subject: bpftool: Normalize compile rules to specify output file last

When dealing with verbose Makefile output, it's extremely confusing when
compiler invocation commands don't specify -o <output.o> as the last
argument. Normalize bpftool's Makefile to do just that, as most other
BPF-related Makefiles are already doing that.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-2-andrii@kernel.org
---
 tools/bpf/bpftool/Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 622568c7a9b8..31dfef6a4121 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -177,7 +177,8 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF)
 		-I$(if $(OUTPUT),$(OUTPUT),.) \
 		-I$(srctree)/tools/include/uapi/ \
 		-I$(LIBBPF_INCLUDE) \
-		-g -O2 -Wall -target bpf -c $< -o $@ && $(LLVM_STRIP) -g $@
+		-g -O2 -Wall -target bpf -c $< -o $@
+	$(Q)$(LLVM_STRIP) -g $@
 
 $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP)
 	$(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@
@@ -192,10 +193,10 @@ endif
 CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS)
 
 $(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
-	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $<
+	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD $< -o $@
 
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
-	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $<
+	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@
 
 $(OUTPUT)feature.o:
 ifneq ($(feature-zlib), 1)
@@ -203,17 +204,16 @@ ifneq ($(feature-zlib), 1)
 endif
 
 $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP)
-	$(QUIET_LINK)$(HOSTCC) $(CFLAGS) $(LDFLAGS) -o $@ $(BOOTSTRAP_OBJS) \
-		$(LIBS_BOOTSTRAP)
+	$(QUIET_LINK)$(HOSTCC) $(CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
-	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
 
 $(BOOTSTRAP_OUTPUT)%.o: %.c $(LIBBPF_INTERNAL_HDRS) | $(BOOTSTRAP_OUTPUT)
-	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $<
+	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD $< -o $@
 
 $(OUTPUT)%.o: %.c
-	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $<
+	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@
 
 feature-detect-clean:
 	$(call QUIET_CLEAN, feature-detect)
-- 
cgit v1.2.3


From de29e6bbb9ee674d639cd42fe565f28757208614 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:17 -0800
Subject: selftests/bpf: Minor cleanups and normalization of Makefile

Few clean ups and single-line simplifications. Also split CLEAN command
into multiple $(RM) invocations as it gets dangerously close to too long
argument list. Make sure that -o <output.o> is used always as the last
argument for saner verbose make output.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-3-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0468ea57650d..0470802c907c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -45,10 +45,8 @@ ifneq ($(BPF_GCC),)
 TEST_GEN_PROGS += test_progs-bpf_gcc
 endif
 
-TEST_GEN_FILES = test_lwt_ip_encap.o \
-	test_tc_edt.o
-TEST_FILES = xsk_prereqs.sh \
-	$(wildcard progs/btf_dump_test_case_*.c)
+TEST_GEN_FILES = test_lwt_ip_encap.o test_tc_edt.o
+TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -107,7 +105,10 @@ endif
 OVERRIDE_TARGETS := 1
 override define CLEAN
 	$(call msg,CLEAN)
-	$(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN)
+	$(Q)$(RM) -r $(TEST_GEN_PROGS)
+	$(Q)$(RM) -r $(TEST_GEN_PROGS_EXTENDED)
+	$(Q)$(RM) -r $(TEST_GEN_FILES)
+	$(Q)$(RM) -r $(EXTRA_CLEAN)
 	$(Q)$(MAKE) -C bpf_testmod clean
 	$(Q)$(MAKE) docs-clean
 endef
@@ -169,7 +170,7 @@ $(OUTPUT)/%:%.c
 
 $(OUTPUT)/urandom_read: urandom_read.c
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id=sha1
+	$(Q)$(CC) $(LDFLAGS) $< $(LDLIBS) -Wl,--build-id=sha1 -o $@
 
 $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
 	$(call msg,MOD,,$@)
@@ -232,16 +233,16 @@ docs-clean:
 	            prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@
 
 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-	   ../../../include/uapi/linux/bpf.h                                   \
+	   $(APIDIR)/linux/bpf.h					       \
 	   | $(BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
 		    EXTRA_CFLAGS='-g -O0'				       \
 		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
 
 ifneq ($(BPFOBJ),$(HOST_BPFOBJ))
-$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)                \
-	   ../../../include/uapi/linux/bpf.h                                   \
-	   | $(HOST_BUILD_DIR)/libbpf
+$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+		$(APIDIR)/linux/bpf.h					       \
+		| $(HOST_BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
 		    EXTRA_CFLAGS='-g -O0'				       \
 		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \
@@ -305,12 +306,12 @@ $(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h
 # $3 - CFLAGS
 define CLANG_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2)
-	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v3
+	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -mcpu=v3 -o $2
 endef
 # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32
 define CLANG_NOALU32_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2)
-	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -o $2 -mcpu=v2
+	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -mcpu=v2 -o $2
 endef
 # Build BPF object using GCC
 define GCC_BPF_BUILD_RULE
@@ -472,13 +473,12 @@ TRUNNER_TESTS_DIR := prog_tests
 TRUNNER_BPF_PROGS_DIR := progs
 TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c	\
 			 network_helpers.c testing_helpers.c		\
-			 btf_helpers.c	flow_dissector_load.h
+			 btf_helpers.c flow_dissector_load.h
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko	\
 		       ima_setup.sh					\
 		       $(wildcard progs/btf_dump_test_case_*.c)
 TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
-TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS)
-TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS
+TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS
 $(eval $(call DEFINE_TEST_RUNNER,test_progs))
 
 # Define test_progs-no_alu32 test runner.
@@ -540,7 +540,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
 		 $(OUTPUT)/bench_ringbufs.o \
 		 $(OUTPUT)/bench_bloom_filter_map.o
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
+	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)	\
 	prog_tests/tests.h map_tests/tests.h verifier/tests.h		\
-- 
cgit v1.2.3


From 957d350a8b94133d114a9b1ac3e79f1f77100681 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:18 -0800
Subject: libbpf: Turn btf_dedup_opts into OPTS-based struct

btf__dedup() and struct btf_dedup_opts were added before we figured out
OPTS mechanism. As such, btf_dedup_opts is non-extensible without
breaking an ABI and potentially crashing user application.

Unfortunately, btf__dedup() and btf_dedup_opts are short and succinct
names that would be great to preserve and use going forward. So we use
___libbpf_override() macro approach, used previously for bpf_prog_load()
API, to define a new btf__dedup() variant that accepts only struct btf *
and struct btf_dedup_opts * arguments, and rename the old btf__dedup()
implementation into btf__dedup_deprecated(). This keeps both source and
binary compatibility with old and new applications.

The biggest problem was struct btf_dedup_opts, which wasn't OPTS-based,
and as such doesn't have `size_t sz;` as a first field. But btf__dedup()
is a pretty rarely used API and I believe that the only currently known
users (besides selftests) are libbpf's own bpf_linker and pahole.
Neither use case actually uses options and just passes NULL. So instead
of doing extra hacks, just rewrite struct btf_dedup_opts into OPTS-based
one, move btf_ext argument into those opts (only bpf_linker needs to
dedup btf_ext, so it's not a typical thing to specify), and drop never
used `dont_resolve_fwds` option (it was never used anywhere, AFAIK, it
makes BTF dedup much less useful and efficient).

Just in case, for old implementation, btf__dedup_deprecated(), detect
non-NULL options and error out with helpful message, to help users
migrate, if there are any user playing with btf__dedup().

The last remaining piece is dedup_table_size, which is another
anachronism from very early days of BTF dedup. Since then it has been
reduced to the only valid value, 1, to request forced hash collisions.
This is only used during testing. So instead introduce a bool flag to
force collisions explicitly.

This patch also adapts selftests to new btf__dedup() and btf_dedup_opts
use to avoid selftests breakage.

  [0] Closes: https://github.com/libbpf/libbpf/issues/281

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-4-andrii@kernel.org
---
 tools/lib/bpf/btf.c                                | 46 +++++++++++++---------
 tools/lib/bpf/btf.h                                | 20 ++++++++--
 tools/lib/bpf/libbpf.map                           |  2 +
 tools/lib/bpf/linker.c                             |  4 +-
 tools/testing/selftests/bpf/prog_tests/btf.c       | 46 ++++------------------
 .../selftests/bpf/prog_tests/btf_dedup_split.c     |  6 +--
 6 files changed, 58 insertions(+), 66 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 7e4c5586bd87..fcec27622e7a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2846,8 +2846,7 @@ __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext)
 
 struct btf_dedup;
 
-static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
-				       const struct btf_dedup_opts *opts);
+static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts);
 static void btf_dedup_free(struct btf_dedup *d);
 static int btf_dedup_prep(struct btf_dedup *d);
 static int btf_dedup_strings(struct btf_dedup *d);
@@ -2994,12 +2993,17 @@ static int btf_dedup_remap_types(struct btf_dedup *d);
  * deduplicating structs/unions is described in greater details in comments for
  * `btf_dedup_is_equiv` function.
  */
-int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
-	       const struct btf_dedup_opts *opts)
+
+DEFAULT_VERSION(btf__dedup_v0_6_0, btf__dedup, LIBBPF_0.6.0)
+int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts)
 {
-	struct btf_dedup *d = btf_dedup_new(btf, btf_ext, opts);
+	struct btf_dedup *d;
 	int err;
 
+	if (!OPTS_VALID(opts, btf_dedup_opts))
+		return libbpf_err(-EINVAL);
+
+	d = btf_dedup_new(btf, opts);
 	if (IS_ERR(d)) {
 		pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d));
 		return libbpf_err(-EINVAL);
@@ -3051,6 +3055,19 @@ done:
 	return libbpf_err(err);
 }
 
+COMPAT_VERSION(bpf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2)
+int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts)
+{
+	LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext);
+
+	if (unused_opts) {
+		pr_warn("please use new version of btf__dedup() that supports options\n");
+		return libbpf_err(-ENOTSUP);
+	}
+
+	return btf__dedup(btf, &opts);
+}
+
 #define BTF_UNPROCESSED_ID ((__u32)-1)
 #define BTF_IN_PROGRESS_ID ((__u32)-2)
 
@@ -3163,8 +3180,7 @@ static bool btf_dedup_equal_fn(const void *k1, const void *k2, void *ctx)
 	return k1 == k2;
 }
 
-static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
-				       const struct btf_dedup_opts *opts)
+static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts)
 {
 	struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup));
 	hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn;
@@ -3173,13 +3189,11 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
 	if (!d)
 		return ERR_PTR(-ENOMEM);
 
-	d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds;
-	/* dedup_table_size is now used only to force collisions in tests */
-	if (opts && opts->dedup_table_size == 1)
+	if (OPTS_GET(opts, force_collisions, false))
 		hash_fn = btf_dedup_collision_hash_fn;
 
 	d->btf = btf;
-	d->btf_ext = btf_ext;
+	d->btf_ext = OPTS_GET(opts, btf_ext, NULL);
 
 	d->dedup_table = hashmap__new(hash_fn, btf_dedup_equal_fn, NULL);
 	if (IS_ERR(d->dedup_table)) {
@@ -3708,8 +3722,6 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id)
 				new_id = cand_id;
 				break;
 			}
-			if (d->opts.dont_resolve_fwds)
-				continue;
 			if (btf_compat_enum(t, cand)) {
 				if (btf_is_enum_fwd(t)) {
 					/* resolve fwd to full enum */
@@ -3952,8 +3964,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		return 0;
 
 	/* FWD <--> STRUCT/UNION equivalence check, if enabled */
-	if (!d->opts.dont_resolve_fwds
-	    && (cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD)
+	if ((cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD)
 	    && cand_kind != canon_kind) {
 		__u16 real_kind;
 		__u16 fwd_kind;
@@ -3979,10 +3990,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		return btf_equal_int_tag(cand_type, canon_type);
 
 	case BTF_KIND_ENUM:
-		if (d->opts.dont_resolve_fwds)
-			return btf_equal_enum(cand_type, canon_type);
-		else
-			return btf_compat_enum(cand_type, canon_type);
+		return btf_compat_enum(cand_type, canon_type);
 
 	case BTF_KIND_FWD:
 	case BTF_KIND_FLOAT:
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index bc005ba3ceec..6aae4f62ee0b 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -245,12 +245,24 @@ LIBBPF_API int btf__add_decl_tag(struct btf *btf, const char *value, int ref_typ
 			    int component_idx);
 
 struct btf_dedup_opts {
-	unsigned int dedup_table_size;
-	bool dont_resolve_fwds;
+	size_t sz;
+	/* optional .BTF.ext info to dedup along the main BTF info */
+	struct btf_ext *btf_ext;
+	/* force hash collisions (used for testing) */
+	bool force_collisions;
+	size_t :0;
 };
+#define btf_dedup_opts__last_field force_collisions
+
+LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
+
+LIBBPF_API int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts);
 
-LIBBPF_API int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
-			  const struct btf_dedup_opts *opts);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__dedup() instead")
+LIBBPF_API int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *opts);
+#define btf__dedup(...) ___libbpf_overload(___btf_dedup, __VA_ARGS__)
+#define ___btf_dedup3(btf, btf_ext, opts) btf__dedup_deprecated(btf, btf_ext, opts)
+#define ___btf_dedup2(btf, opts) btf__dedup(btf, opts)
 
 struct btf_dump;
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 256d5d4be951..b8c8d14621d2 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -403,6 +403,8 @@ LIBBPF_0.6.0 {
 		bpf_program__set_extra_flags;
 		btf__add_btf;
 		btf__add_decl_tag;
+		btf__dedup;
+		btf__dedup_deprecated;
 		btf__raw_data;
 		btf__type_cnt;
 } LIBBPF_0.5.0;
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index f677dccdeae4..594b206fa674 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -2650,6 +2650,7 @@ static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name,
 
 static int finalize_btf(struct bpf_linker *linker)
 {
+	LIBBPF_OPTS(btf_dedup_opts, opts);
 	struct btf *btf = linker->btf;
 	const void *raw_data;
 	int i, j, id, err;
@@ -2686,7 +2687,8 @@ static int finalize_btf(struct bpf_linker *linker)
 		return err;
 	}
 
-	err = btf__dedup(linker->btf, linker->btf_ext, NULL);
+	opts.btf_ext = linker->btf_ext;
+	err = btf__dedup(linker->btf, &opts);
 	if (err) {
 		pr_warn("BTF dedup failed: %d\n", err);
 		return err;
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index ebd1aa4d09d6..1e8b36d74df2 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -6627,7 +6627,7 @@ struct btf_dedup_test {
 	struct btf_dedup_opts opts;
 };
 
-const struct btf_dedup_test dedup_tests[] = {
+static struct btf_dedup_test dedup_tests[] = {
 
 {
 	.descr = "dedup: unused strings filtering",
@@ -6647,9 +6647,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0int\0long"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: strings deduplication",
@@ -6672,9 +6669,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0int\0long int"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: struct example #1",
@@ -6755,9 +6749,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: struct <-> fwd resolution w/ hash collision",
@@ -6800,8 +6791,7 @@ const struct btf_dedup_test dedup_tests[] = {
 		BTF_STR_SEC("\0s\0x"),
 	},
 	.opts = {
-		.dont_resolve_fwds = false,
-		.dedup_table_size = 1, /* force hash collisions */
+		.force_collisions = true, /* force hash collisions */
 	},
 },
 {
@@ -6847,8 +6837,7 @@ const struct btf_dedup_test dedup_tests[] = {
 		BTF_STR_SEC("\0s\0x"),
 	},
 	.opts = {
-		.dont_resolve_fwds = false,
-		.dedup_table_size = 1, /* force hash collisions */
+		.force_collisions = true, /* force hash collisions */
 	},
 },
 {
@@ -6911,9 +6900,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: no int/float duplicates",
@@ -6965,9 +6951,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0int\0some other int\0float"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: enum fwd resolution",
@@ -7009,9 +6992,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0e1\0e1_val\0e2\0e2_val"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: datasec and vars pass-through",
@@ -7054,8 +7034,7 @@ const struct btf_dedup_test dedup_tests[] = {
 		BTF_STR_SEC("\0.bss\0t"),
 	},
 	.opts = {
-		.dont_resolve_fwds = false,
-		.dedup_table_size = 1
+		.force_collisions = true
 	},
 },
 {
@@ -7099,9 +7078,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0t\0a1\0a2\0f\0tag"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: func/func_param tags",
@@ -7152,9 +7128,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0a1\0a2\0f\0tag1\0tag2\0tag3"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: struct/struct_member tags",
@@ -7200,9 +7173,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0t\0m1\0m2\0tag1\0tag2\0tag3"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 {
 	.descr = "dedup: typedef tags",
@@ -7233,9 +7203,6 @@ const struct btf_dedup_test dedup_tests[] = {
 		},
 		BTF_STR_SEC("\0t\0tag1\0tag2\0tag3"),
 	},
-	.opts = {
-		.dont_resolve_fwds = false,
-	},
 },
 
 };
@@ -7293,7 +7260,7 @@ static void dump_btf_strings(const char *strs, __u32 len)
 
 static void do_test_dedup(unsigned int test_num)
 {
-	const struct btf_dedup_test *test = &dedup_tests[test_num - 1];
+	struct btf_dedup_test *test = &dedup_tests[test_num - 1];
 	__u32 test_nr_types, expect_nr_types, test_btf_size, expect_btf_size;
 	const struct btf_header *test_hdr, *expect_hdr;
 	struct btf *test_btf = NULL, *expect_btf = NULL;
@@ -7337,7 +7304,8 @@ static void do_test_dedup(unsigned int test_num)
 		goto done;
 	}
 
-	err = btf__dedup(test_btf, NULL, &test->opts);
+	test->opts.sz = sizeof(test->opts);
+	err = btf__dedup(test_btf, &test->opts);
 	if (CHECK(err, "btf_dedup failed errno:%d", err)) {
 		err = -1;
 		goto done;
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
index 64554fd33547..9d3b8d7a1537 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
@@ -92,7 +92,7 @@ struct s2 {\n\
 	int *f3;\n\
 };\n\n", "c_dump");
 
-	err = btf__dedup(btf2, NULL, NULL);
+	err = btf__dedup(btf2, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
 		goto cleanup;
 
@@ -186,7 +186,7 @@ static void test_split_fwd_resolve() {
 		"\t'f1' type_id=7 bits_offset=0\n"
 		"\t'f2' type_id=9 bits_offset=64");
 
-	err = btf__dedup(btf2, NULL, NULL);
+	err = btf__dedup(btf2, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
 		goto cleanup;
 
@@ -283,7 +283,7 @@ static void test_split_struct_duped() {
 		"[13] STRUCT 's3' size=8 vlen=1\n"
 		"\t'f1' type_id=12 bits_offset=0");
 
-	err = btf__dedup(btf2, NULL, NULL);
+	err = btf__dedup(btf2, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
 		goto cleanup;
 
-- 
cgit v1.2.3


From 6084f5dc928f2ada4331ba9eda65542e94d86bc6 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:19 -0800
Subject: libbpf: Ensure btf_dump__new() and btf_dump_opts are future-proof

Change btf_dump__new() and corresponding struct btf_dump_ops structure
to be extensible by using OPTS "framework" ([0]). Given we don't change
the names, we use a similar approach as with bpf_prog_load(), but this
time we ended up with two APIs with the same name and same number of
arguments, so overloading based on number of arguments with
___libbpf_override() doesn't work.

Instead, use "overloading" based on types. In this particular case,
print callback has to be specified, so we detect which argument is
a callback. If it's 4th (last) argument, old implementation of API is
used by user code. If not, it must be 2nd, and thus new implementation
is selected. The rest is handled by the same symbol versioning approach.

btf_ext argument is dropped as it was never used and isn't necessary
either. If in the future we'll need btf_ext, that will be added into
OPTS-based struct btf_dump_opts.

struct btf_dump_opts is reused for both old API and new APIs. ctx field
is marked deprecated in v0.7+ and it's put at the same memory location
as OPTS's sz field. Any user of new-style btf_dump__new() will have to
set sz field and doesn't/shouldn't use ctx, as ctx is now passed along
the callback as mandatory input argument, following the other APIs in
libbpf that accept callbacks consistently.

Again, this is quite ugly in implementation, but is done in the name of
backwards compatibility and uniform and extensible future APIs (at the
same time, sigh). And it will be gone in libbpf 1.0.

  [0] Closes: https://github.com/libbpf/libbpf/issues/283

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-5-andrii@kernel.org
---
 tools/lib/bpf/btf.h      | 51 ++++++++++++++++++++++++++++++++++++++++++++----
 tools/lib/bpf/btf_dump.c | 31 ++++++++++++++++++++---------
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 71 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 6aae4f62ee0b..45310c65e865 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -267,15 +267,58 @@ LIBBPF_API int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, c
 struct btf_dump;
 
 struct btf_dump_opts {
-	void *ctx;
+	union {
+		size_t sz;
+		void *ctx; /* DEPRECATED: will be gone in v1.0 */
+	};
 };
 
 typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args);
 
 LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf,
-					  const struct btf_ext *btf_ext,
-					  const struct btf_dump_opts *opts,
-					  btf_dump_printf_fn_t printf_fn);
+					  btf_dump_printf_fn_t printf_fn,
+					  void *ctx,
+					  const struct btf_dump_opts *opts);
+
+LIBBPF_API struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf,
+						 btf_dump_printf_fn_t printf_fn,
+						 void *ctx,
+						 const struct btf_dump_opts *opts);
+
+LIBBPF_API struct btf_dump *btf_dump__new_deprecated(const struct btf *btf,
+						     const struct btf_ext *btf_ext,
+						     const struct btf_dump_opts *opts,
+						     btf_dump_printf_fn_t printf_fn);
+
+/* Choose either btf_dump__new() or btf_dump__new_deprecated() based on the
+ * type of 4th argument. If it's btf_dump's print callback, use deprecated
+ * API; otherwise, choose the new btf_dump__new(). ___libbpf_override()
+ * doesn't work here because both variants have 4 input arguments.
+ *
+ * (void *) casts are necessary to avoid compilation warnings about type
+ * mismatches, because even though __builtin_choose_expr() only ever evaluates
+ * one side the other side still has to satisfy type constraints (this is
+ * compiler implementation limitation which might be lifted eventually,
+ * according to the documentation). So passing struct btf_ext in place of
+ * btf_dump_printf_fn_t would be generating compilation warning.  Casting to
+ * void * avoids this issue.
+ *
+ * Also, two type compatibility checks for a function and function pointer are
+ * required because passing function reference into btf_dump__new() as
+ * btf_dump__new(..., my_callback, ...) and as btf_dump__new(...,
+ * &my_callback, ...) (not explicit ampersand in the latter case) actually
+ * differs as far as __builtin_types_compatible_p() is concerned. Thus two
+ * checks are combined to detect callback argument.
+ *
+ * The rest works just like in case of ___libbpf_override() usage with symbol
+ * versioning.
+ */
+#define btf_dump__new(a1, a2, a3, a4) __builtin_choose_expr(				\
+	__builtin_types_compatible_p(typeof(a4), btf_dump_printf_fn_t) ||		\
+	__builtin_types_compatible_p(typeof(a4), void(void *, const char *, va_list)),	\
+	btf_dump__new_deprecated((void *)a1, (void *)a2, (void *)a3, (void *)a4),	\
+	btf_dump__new((void *)a1, (void *)a2, (void *)a3, (void *)a4))
+
 LIBBPF_API void btf_dump__free(struct btf_dump *d);
 
 LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id);
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 17db62b5002e..b8cd7e4f557a 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -77,9 +77,8 @@ struct btf_dump_data {
 
 struct btf_dump {
 	const struct btf *btf;
-	const struct btf_ext *btf_ext;
 	btf_dump_printf_fn_t printf_fn;
-	struct btf_dump_opts opts;
+	void *cb_ctx;
 	int ptr_sz;
 	bool strip_mods;
 	bool skip_anon_defs;
@@ -138,29 +137,32 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...)
 	va_list args;
 
 	va_start(args, fmt);
-	d->printf_fn(d->opts.ctx, fmt, args);
+	d->printf_fn(d->cb_ctx, fmt, args);
 	va_end(args);
 }
 
 static int btf_dump_mark_referenced(struct btf_dump *d);
 static int btf_dump_resize(struct btf_dump *d);
 
-struct btf_dump *btf_dump__new(const struct btf *btf,
-			       const struct btf_ext *btf_ext,
-			       const struct btf_dump_opts *opts,
-			       btf_dump_printf_fn_t printf_fn)
+DEFAULT_VERSION(btf_dump__new_v0_6_0, btf_dump__new, LIBBPF_0.6.0)
+struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf,
+				      btf_dump_printf_fn_t printf_fn,
+				      void *ctx,
+				      const struct btf_dump_opts *opts)
 {
 	struct btf_dump *d;
 	int err;
 
+	if (!printf_fn)
+		return libbpf_err_ptr(-EINVAL);
+
 	d = calloc(1, sizeof(struct btf_dump));
 	if (!d)
 		return libbpf_err_ptr(-ENOMEM);
 
 	d->btf = btf;
-	d->btf_ext = btf_ext;
 	d->printf_fn = printf_fn;
-	d->opts.ctx = opts ? opts->ctx : NULL;
+	d->cb_ctx = ctx;
 	d->ptr_sz = btf__pointer_size(btf) ? : sizeof(void *);
 
 	d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL);
@@ -186,6 +188,17 @@ err:
 	return libbpf_err_ptr(err);
 }
 
+COMPAT_VERSION(btf_dump__new_deprecated, btf_dump__new, LIBBPF_0.0.4)
+struct btf_dump *btf_dump__new_deprecated(const struct btf *btf,
+					  const struct btf_ext *btf_ext,
+					  const struct btf_dump_opts *opts,
+					  btf_dump_printf_fn_t printf_fn)
+{
+	if (!printf_fn)
+		return libbpf_err_ptr(-EINVAL);
+	return btf_dump__new_v0_6_0(btf, printf_fn, opts ? opts->ctx : NULL, opts);
+}
+
 static int btf_dump_resize(struct btf_dump *d)
 {
 	int err, last_id = btf__type_cnt(d->btf) - 1;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b8c8d14621d2..07762d6723ef 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -407,4 +407,6 @@ LIBBPF_0.6.0 {
 		btf__dedup_deprecated;
 		btf__raw_data;
 		btf__type_cnt;
+		btf_dump__new;
+		btf_dump__new_deprecated;
 } LIBBPF_0.5.0;
-- 
cgit v1.2.3


From 4178893465774f91dcd49465ae6f4e3cc036b7b2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:20 -0800
Subject: libbpf: Make perf_buffer__new() use OPTS-based interface

Add new variants of perf_buffer__new() and perf_buffer__new_raw() that
use OPTS-based options for future extensibility ([0]). Given all the
currently used API names are best fits, re-use them and use
___libbpf_override() approach and symbol versioning to preserve ABI and
source code compatibility. struct perf_buffer_opts and struct
perf_buffer_raw_opts are kept as well, but they are restructured such
that they are OPTS-based when used with new APIs. For struct
perf_buffer_raw_opts we keep few fields intact, so we have to also
preserve the memory location of them both when used as OPTS and for
legacy API variants. This is achieved with anonymous padding for OPTS
"incarnation" of the struct.  These pads can be eventually used for new
options.

  [0] Closes: https://github.com/libbpf/libbpf/issues/311

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-6-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c   | 70 ++++++++++++++++++++++++++++++---------
 tools/lib/bpf/libbpf.h   | 86 ++++++++++++++++++++++++++++++++++++++++--------
 tools/lib/bpf/libbpf.map |  4 +++
 3 files changed, 132 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a823b5ed705b..7db14859b27c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10578,11 +10578,18 @@ error:
 static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
 					      struct perf_buffer_params *p);
 
-struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt,
-				     const struct perf_buffer_opts *opts)
+DEFAULT_VERSION(perf_buffer__new_v0_6_0, perf_buffer__new, LIBBPF_0.6.0)
+struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt,
+					    perf_buffer_sample_fn sample_cb,
+					    perf_buffer_lost_fn lost_cb,
+					    void *ctx,
+					    const struct perf_buffer_opts *opts)
 {
 	struct perf_buffer_params p = {};
-	struct perf_event_attr attr = { 0, };
+	struct perf_event_attr attr = {};
+
+	if (!OPTS_VALID(opts, perf_buffer_opts))
+		return libbpf_err_ptr(-EINVAL);
 
 	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
 	attr.type = PERF_TYPE_SOFTWARE;
@@ -10591,29 +10598,62 @@ struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt,
 	attr.wakeup_events = 1;
 
 	p.attr = &attr;
-	p.sample_cb = opts ? opts->sample_cb : NULL;
-	p.lost_cb = opts ? opts->lost_cb : NULL;
-	p.ctx = opts ? opts->ctx : NULL;
+	p.sample_cb = sample_cb;
+	p.lost_cb = lost_cb;
+	p.ctx = ctx;
 
 	return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p));
 }
 
-struct perf_buffer *
-perf_buffer__new_raw(int map_fd, size_t page_cnt,
-		     const struct perf_buffer_raw_opts *opts)
+COMPAT_VERSION(perf_buffer__new_deprecated, perf_buffer__new, LIBBPF_0.0.4)
+struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt,
+						const struct perf_buffer_opts *opts)
+{
+	return perf_buffer__new_v0_6_0(map_fd, page_cnt,
+				       opts ? opts->sample_cb : NULL,
+				       opts ? opts->lost_cb : NULL,
+				       opts ? opts->ctx : NULL,
+				       NULL);
+}
+
+DEFAULT_VERSION(perf_buffer__new_raw_v0_6_0, perf_buffer__new_raw, LIBBPF_0.6.0)
+struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt,
+						struct perf_event_attr *attr,
+						perf_buffer_event_fn event_cb, void *ctx,
+						const struct perf_buffer_raw_opts *opts)
 {
 	struct perf_buffer_params p = {};
 
-	p.attr = opts->attr;
-	p.event_cb = opts->event_cb;
-	p.ctx = opts->ctx;
-	p.cpu_cnt = opts->cpu_cnt;
-	p.cpus = opts->cpus;
-	p.map_keys = opts->map_keys;
+	if (page_cnt == 0 || !attr)
+		return libbpf_err_ptr(-EINVAL);
+
+	if (!OPTS_VALID(opts, perf_buffer_raw_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	p.attr = attr;
+	p.event_cb = event_cb;
+	p.ctx = ctx;
+	p.cpu_cnt = OPTS_GET(opts, cpu_cnt, 0);
+	p.cpus = OPTS_GET(opts, cpus, NULL);
+	p.map_keys = OPTS_GET(opts, map_keys, NULL);
 
 	return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p));
 }
 
+COMPAT_VERSION(perf_buffer__new_raw_deprecated, perf_buffer__new_raw, LIBBPF_0.0.4)
+struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt,
+						    const struct perf_buffer_raw_opts *opts)
+{
+	LIBBPF_OPTS(perf_buffer_raw_opts, inner_opts,
+		.cpu_cnt = opts->cpu_cnt,
+		.cpus = opts->cpus,
+		.map_keys = opts->map_keys,
+	);
+
+	return perf_buffer__new_raw_v0_6_0(map_fd, page_cnt, opts->attr,
+					   opts->event_cb, opts->ctx, &inner_opts);
+}
+
 static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
 					      struct perf_buffer_params *p)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index f69512ae8505..4ec69f224342 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -778,18 +778,52 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt);
 
 /* common use perf buffer options */
 struct perf_buffer_opts {
-	/* if specified, sample_cb is called for each sample */
-	perf_buffer_sample_fn sample_cb;
-	/* if specified, lost_cb is called for each batch of lost samples */
-	perf_buffer_lost_fn lost_cb;
-	/* ctx is provided to sample_cb and lost_cb */
-	void *ctx;
+	union {
+		size_t sz;
+		struct { /* DEPRECATED: will be removed in v1.0 */
+			/* if specified, sample_cb is called for each sample */
+			perf_buffer_sample_fn sample_cb;
+			/* if specified, lost_cb is called for each batch of lost samples */
+			perf_buffer_lost_fn lost_cb;
+			/* ctx is provided to sample_cb and lost_cb */
+			void *ctx;
+		};
+	};
 };
+#define perf_buffer_opts__last_field sz
 
+/**
+ * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified
+ * BPF_PERF_EVENT_ARRAY map
+ * @param map_fd FD of BPF_PERF_EVENT_ARRAY BPF map that will be used by BPF
+ * code to send data over to user-space
+ * @param page_cnt number of memory pages allocated for each per-CPU buffer
+ * @param sample_cb function called on each received data record
+ * @param lost_cb function called when record loss has occurred
+ * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb*
+ * @return a new instance of struct perf_buffer on success, NULL on error with
+ * *errno* containing an error code
+ */
 LIBBPF_API struct perf_buffer *
 perf_buffer__new(int map_fd, size_t page_cnt,
+		 perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx,
 		 const struct perf_buffer_opts *opts);
 
+LIBBPF_API struct perf_buffer *
+perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt,
+			perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx,
+			const struct perf_buffer_opts *opts);
+
+LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new() instead")
+struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt,
+						const struct perf_buffer_opts *opts);
+
+#define perf_buffer__new(...) ___libbpf_overload(___perf_buffer_new, __VA_ARGS__)
+#define ___perf_buffer_new6(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) \
+	perf_buffer__new(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts)
+#define ___perf_buffer_new3(map_fd, page_cnt, opts) \
+	perf_buffer__new_deprecated(map_fd, page_cnt, opts)
+
 enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_DONE	= 0,
 	LIBBPF_PERF_EVENT_ERROR	= -1,
@@ -803,12 +837,21 @@ typedef enum bpf_perf_event_ret
 
 /* raw perf buffer options, giving most power and control */
 struct perf_buffer_raw_opts {
-	/* perf event attrs passed directly into perf_event_open() */
-	struct perf_event_attr *attr;
-	/* raw event callback */
-	perf_buffer_event_fn event_cb;
-	/* ctx is provided to event_cb */
-	void *ctx;
+	union {
+		struct {
+			size_t sz;
+			long :0;
+			long :0;
+		};
+		struct { /* DEPRECATED: will be removed in v1.0 */
+			/* perf event attrs passed directly into perf_event_open() */
+			struct perf_event_attr *attr;
+			/* raw event callback */
+			perf_buffer_event_fn event_cb;
+			/* ctx is provided to event_cb */
+			void *ctx;
+		};
+	};
 	/* if cpu_cnt == 0, open all on all possible CPUs (up to the number of
 	 * max_entries of given PERF_EVENT_ARRAY map)
 	 */
@@ -818,11 +861,28 @@ struct perf_buffer_raw_opts {
 	/* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */
 	int *map_keys;
 };
+#define perf_buffer_raw_opts__last_field map_keys
 
 LIBBPF_API struct perf_buffer *
-perf_buffer__new_raw(int map_fd, size_t page_cnt,
+perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr,
+		     perf_buffer_event_fn event_cb, void *ctx,
 		     const struct perf_buffer_raw_opts *opts);
 
+LIBBPF_API struct perf_buffer *
+perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, struct perf_event_attr *attr,
+			    perf_buffer_event_fn event_cb, void *ctx,
+			    const struct perf_buffer_raw_opts *opts);
+
+LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new_raw() instead")
+struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt,
+						    const struct perf_buffer_raw_opts *opts);
+
+#define perf_buffer__new_raw(...) ___libbpf_overload(___perf_buffer_new_raw, __VA_ARGS__)
+#define ___perf_buffer_new_raw6(map_fd, page_cnt, attr, event_cb, ctx, opts) \
+	perf_buffer__new_raw(map_fd, page_cnt, attr, event_cb, ctx, opts)
+#define ___perf_buffer_new_raw3(map_fd, page_cnt, opts) \
+	perf_buffer__new_raw_deprecated(map_fd, page_cnt, opts)
+
 LIBBPF_API void perf_buffer__free(struct perf_buffer *pb);
 LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb);
 LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 07762d6723ef..9e91aa8da303 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -409,4 +409,8 @@ LIBBPF_0.6.0 {
 		btf__type_cnt;
 		btf_dump__new;
 		btf_dump__new_deprecated;
+		perf_buffer__new;
+		perf_buffer__new_deprecated;
+		perf_buffer__new_raw;
+		perf_buffer__new_raw_deprecated;
 } LIBBPF_0.5.0;
-- 
cgit v1.2.3


From 0b52a5f4b994c05070237271c7fac3265b640ffb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:21 -0800
Subject: selftests/bpf: Migrate all deprecated perf_buffer uses

Migrate all old-style perf_buffer__new() and perf_buffer__new_raw()
calls to new v1.0+ variants.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-7-andrii@kernel.org
---
 tools/testing/selftests/bpf/benchs/bench_ringbufs.c       | 8 ++------
 tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c | 5 ++---
 tools/testing/selftests/bpf/prog_tests/kfree_skb.c        | 6 ++----
 tools/testing/selftests/bpf/prog_tests/perf_buffer.c      | 6 ++----
 tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c      | 7 ++-----
 tools/testing/selftests/bpf/test_tcpnotify_user.c         | 4 +---
 6 files changed, 11 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index d167bffac679..52d4a2f91dbd 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -394,11 +394,6 @@ static void perfbuf_libbpf_setup()
 {
 	struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx;
 	struct perf_event_attr attr;
-	struct perf_buffer_raw_opts pb_opts = {
-		.event_cb = perfbuf_process_sample_raw,
-		.ctx = (void *)(long)0,
-		.attr = &attr,
-	};
 	struct bpf_link *link;
 
 	ctx->skel = perfbuf_setup_skeleton();
@@ -423,7 +418,8 @@ static void perfbuf_libbpf_setup()
 	}
 
 	ctx->perfbuf = perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf),
-					    args.perfbuf_sz, &pb_opts);
+					    args.perfbuf_sz, &attr,
+					    perfbuf_process_sample_raw, NULL, NULL);
 	if (!ctx->perfbuf) {
 		fprintf(stderr, "failed to create perfbuf\n");
 		exit(1);
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 569fcc6ed660..4184c399d4c6 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -85,7 +85,6 @@ void test_get_stack_raw_tp(void)
 	const char *file_err = "./test_get_stack_rawtp_err.o";
 	const char *prog_name = "raw_tracepoint/sys_enter";
 	int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP;
-	struct perf_buffer_opts pb_opts = {};
 	struct perf_buffer *pb = NULL;
 	struct bpf_link *link = NULL;
 	struct timespec tv = {0, 10};
@@ -124,8 +123,8 @@ void test_get_stack_raw_tp(void)
 	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto close_prog;
 
-	pb_opts.sample_cb = get_stack_print_output;
-	pb = perf_buffer__new(bpf_map__fd(map), 8, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(map), 8, get_stack_print_output,
+			      NULL, NULL, NULL);
 	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto close_prog;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index 885413ed5c96..2a49f8fcde06 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -66,7 +66,6 @@ void serial_test_kfree_skb(void)
 	struct bpf_map *perf_buf_map, *global_data;
 	struct bpf_program *prog, *fentry, *fexit;
 	struct bpf_object *obj, *obj2 = NULL;
-	struct perf_buffer_opts pb_opts = {};
 	struct perf_buffer *pb = NULL;
 	int err, kfree_skb_fd;
 	bool passed = false;
@@ -112,9 +111,8 @@ void serial_test_kfree_skb(void)
 		goto close_prog;
 
 	/* set up perf buffer */
-	pb_opts.sample_cb = on_sample;
-	pb_opts.ctx = &passed;
-	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1,
+			      on_sample, NULL, &passed, NULL);
 	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto close_prog;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
index 4e32f3586a75..5fc2b3a0711e 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
@@ -47,7 +47,6 @@ void serial_test_perf_buffer(void)
 {
 	int err, on_len, nr_on_cpus = 0, nr_cpus, i, j;
 	int zero = 0, my_pid = getpid();
-	struct perf_buffer_opts pb_opts = {};
 	struct test_perf_buffer *skel;
 	cpu_set_t cpu_seen;
 	struct perf_buffer *pb;
@@ -82,9 +81,8 @@ void serial_test_perf_buffer(void)
 		goto out_close;
 
 	/* set up perf buffer */
-	pb_opts.sample_cb = on_sample;
-	pb_opts.ctx = &cpu_seen;
-	pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1,
+			      on_sample, NULL, &cpu_seen, NULL);
 	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto out_close;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
index 3bd5904b4db5..f99386d1dc4c 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
@@ -49,7 +49,6 @@ void test_xdp_bpf2bpf(void)
 	struct vip key4 = {.protocol = 6, .family = AF_INET};
 	struct bpf_program *prog;
 	struct perf_buffer *pb = NULL;
-	struct perf_buffer_opts pb_opts = {};
 
 	/* Load XDP program to introspect */
 	pkt_skel = test_xdp__open_and_load();
@@ -86,10 +85,8 @@ void test_xdp_bpf2bpf(void)
 		goto out;
 
 	/* Set up perf buffer */
-	pb_opts.sample_cb = on_sample;
-	pb_opts.ctx = &passed;
-	pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map),
-			      1, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 1,
+			      on_sample, NULL, &passed, NULL);
 	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto out;
 
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
index 63111cb082fe..4c5114765b23 100644
--- a/tools/testing/selftests/bpf/test_tcpnotify_user.c
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -72,7 +72,6 @@ int main(int argc, char **argv)
 {
 	const char *file = "test_tcpnotify_kern.o";
 	struct bpf_map *perf_map, *global_map;
-	struct perf_buffer_opts pb_opts = {};
 	struct tcpnotify_globals g = {0};
 	struct perf_buffer *pb = NULL;
 	const char *cg_path = "/foo";
@@ -117,8 +116,7 @@ int main(int argc, char **argv)
 		return -1;
 	}
 
-	pb_opts.sample_cb = dummyfn;
-	pb = perf_buffer__new(bpf_map__fd(perf_map), 8, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(perf_map), 8, dummyfn, NULL, NULL, NULL);
 	if (!pb)
 		goto err;
 
-- 
cgit v1.2.3


From 60ba87bb6bafaaa6e8ef9a73834cf701194d1923 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:22 -0800
Subject: selftests/bpf: Update btf_dump__new() uses to v1.0+ variant

Update to-be-deprecated forms of btf_dump__new().

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-8-andrii@kernel.org
---
 tools/testing/selftests/bpf/btf_helpers.c          |  4 +--
 tools/testing/selftests/bpf/prog_tests/btf_dump.c  | 33 +++++++++-------------
 tools/testing/selftests/bpf/prog_tests/btf_split.c |  4 +--
 3 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c
index 3d1a748d09d8..acb59202486d 100644
--- a/tools/testing/selftests/bpf/btf_helpers.c
+++ b/tools/testing/selftests/bpf/btf_helpers.c
@@ -238,7 +238,6 @@ const char *btf_type_c_dump(const struct btf *btf)
 	static char buf[16 * 1024];
 	FILE *buf_file;
 	struct btf_dump *d = NULL;
-	struct btf_dump_opts opts = {};
 	int err, i;
 
 	buf_file = fmemopen(buf, sizeof(buf) - 1, "w");
@@ -247,8 +246,7 @@ const char *btf_type_c_dump(const struct btf *btf)
 		return NULL;
 	}
 
-	opts.ctx = buf_file;
-	d = btf_dump__new(btf, NULL, &opts, btf_dump_printf);
+	d = btf_dump__new(btf, btf_dump_printf, buf_file, NULL);
 	if (libbpf_get_error(d)) {
 		fprintf(stderr, "Failed to create btf_dump instance: %ld\n", libbpf_get_error(d));
 		goto err_out;
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index a04961942dfa..d6272013a5a3 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -13,25 +13,23 @@ static struct btf_dump_test_case {
 	const char *name;
 	const char *file;
 	bool known_ptr_sz;
-	struct btf_dump_opts opts;
 } btf_dump_test_cases[] = {
-	{"btf_dump: syntax", "btf_dump_test_case_syntax", true, {}},
-	{"btf_dump: ordering", "btf_dump_test_case_ordering", false, {}},
-	{"btf_dump: padding", "btf_dump_test_case_padding", true, {}},
-	{"btf_dump: packing", "btf_dump_test_case_packing", true, {}},
-	{"btf_dump: bitfields", "btf_dump_test_case_bitfields", true, {}},
-	{"btf_dump: multidim", "btf_dump_test_case_multidim", false, {}},
-	{"btf_dump: namespacing", "btf_dump_test_case_namespacing", false, {}},
+	{"btf_dump: syntax", "btf_dump_test_case_syntax", true},
+	{"btf_dump: ordering", "btf_dump_test_case_ordering", false},
+	{"btf_dump: padding", "btf_dump_test_case_padding", true},
+	{"btf_dump: packing", "btf_dump_test_case_packing", true},
+	{"btf_dump: bitfields", "btf_dump_test_case_bitfields", true},
+	{"btf_dump: multidim", "btf_dump_test_case_multidim", false},
+	{"btf_dump: namespacing", "btf_dump_test_case_namespacing", false},
 };
 
-static int btf_dump_all_types(const struct btf *btf,
-			      const struct btf_dump_opts *opts)
+static int btf_dump_all_types(const struct btf *btf, void *ctx)
 {
 	size_t type_cnt = btf__type_cnt(btf);
 	struct btf_dump *d;
 	int err = 0, id;
 
-	d = btf_dump__new(btf, NULL, opts, btf_dump_printf);
+	d = btf_dump__new(btf, btf_dump_printf, ctx, NULL);
 	err = libbpf_get_error(d);
 	if (err)
 		return err;
@@ -88,8 +86,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t)
 		goto done;
 	}
 
-	t->opts.ctx = f;
-	err = btf_dump_all_types(btf, &t->opts);
+	err = btf_dump_all_types(btf, f);
 	fclose(f);
 	close(fd);
 	if (CHECK(err, "btf_dump", "failure during C dumping: %d\n", err)) {
@@ -137,7 +134,6 @@ static void test_btf_dump_incremental(void)
 {
 	struct btf *btf = NULL;
 	struct btf_dump *d = NULL;
-	struct btf_dump_opts opts;
 	int id, err, i;
 
 	dump_buf_file = open_memstream(&dump_buf, &dump_buf_sz);
@@ -146,8 +142,7 @@ static void test_btf_dump_incremental(void)
 	btf = btf__new_empty();
 	if (!ASSERT_OK_PTR(btf, "new_empty"))
 		goto err_out;
-	opts.ctx = dump_buf_file;
-	d = btf_dump__new(btf, NULL, &opts, btf_dump_printf);
+	d = btf_dump__new(btf, btf_dump_printf, dump_buf_file, NULL);
 	if (!ASSERT_OK(libbpf_get_error(d), "btf_dump__new"))
 		goto err_out;
 
@@ -815,7 +810,6 @@ static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str,
 static void test_btf_dump_datasec_data(char *str)
 {
 	struct btf *btf;
-	struct btf_dump_opts opts = { .ctx = str };
 	char license[4] = "GPL";
 	struct btf_dump *d;
 
@@ -823,7 +817,7 @@ static void test_btf_dump_datasec_data(char *str)
 	if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found"))
 		return;
 
-	d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf);
+	d = btf_dump__new(btf, btf_dump_snprintf, str, NULL);
 	if (!ASSERT_OK_PTR(d, "could not create BTF dump"))
 		goto out;
 
@@ -837,7 +831,6 @@ out:
 
 void test_btf_dump() {
 	char str[STRSIZE];
-	struct btf_dump_opts opts = { .ctx = str };
 	struct btf_dump *d;
 	struct btf *btf;
 	int i;
@@ -857,7 +850,7 @@ void test_btf_dump() {
 	if (!ASSERT_OK_PTR(btf, "no kernel BTF found"))
 		return;
 
-	d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf);
+	d = btf_dump__new(btf, btf_dump_snprintf, str, NULL);
 	if (!ASSERT_OK_PTR(d, "could not create BTF dump"))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c
index b1ffe61f2aa9..eef1158676ed 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c
@@ -13,7 +13,6 @@ static void btf_dump_printf(void *ctx, const char *fmt, va_list args)
 }
 
 void test_btf_split() {
-	struct btf_dump_opts opts;
 	struct btf_dump *d = NULL;
 	const struct btf_type *t;
 	struct btf *btf1, *btf2;
@@ -68,8 +67,7 @@ void test_btf_split() {
 	dump_buf_file = open_memstream(&dump_buf, &dump_buf_sz);
 	if (!ASSERT_OK_PTR(dump_buf_file, "dump_memstream"))
 		return;
-	opts.ctx = dump_buf_file;
-	d = btf_dump__new(btf2, NULL, &opts, btf_dump_printf);
+	d = btf_dump__new(btf2, btf_dump_printf, dump_buf_file, NULL);
 	if (!ASSERT_OK_PTR(d, "btf_dump__new"))
 		goto cleanup;
 	for (i = 1; i < btf__type_cnt(btf2); i++) {
-- 
cgit v1.2.3


From eda8bfa5b7c76d332ece1f24a3662ca843fd880a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:23 -0800
Subject: tools/runqslower: Update perf_buffer__new() calls

Use v1.0+ compatible variant of perf_buffer__new() call to prepare for
deprecation.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-9-andrii@kernel.org
---
 tools/bpf/runqslower/runqslower.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c
index d89715844952..2414cc764461 100644
--- a/tools/bpf/runqslower/runqslower.c
+++ b/tools/bpf/runqslower/runqslower.c
@@ -123,7 +123,6 @@ int main(int argc, char **argv)
 		.parser = parse_arg,
 		.doc = argp_program_doc,
 	};
-	struct perf_buffer_opts pb_opts;
 	struct perf_buffer *pb = NULL;
 	struct runqslower_bpf *obj;
 	int err;
@@ -165,9 +164,8 @@ int main(int argc, char **argv)
 	printf("Tracing run queue latency higher than %llu us\n", env.min_us);
 	printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)");
 
-	pb_opts.sample_cb = handle_event;
-	pb_opts.lost_cb = handle_lost_events;
-	pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64, &pb_opts);
+	pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64,
+			      handle_event, handle_lost_events, NULL, NULL);
 	err = libbpf_get_error(pb);
 	if (err) {
 		pb = NULL;
-- 
cgit v1.2.3


From 164b04f27fbd769f57905dfddd2a8953974eeef4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 10 Nov 2021 21:36:24 -0800
Subject: bpftool: Update btf_dump__new() and perf_buffer__new_raw() calls

Use v1.0-compatible variants of btf_dump and perf_buffer "constructors".
This is also a demonstration of reusing struct perf_buffer_raw_opts as
OPTS-style option struct for new perf_buffer__new_raw() API.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211111053624.190580-10-andrii@kernel.org
---
 tools/bpf/bpftool/btf.c           | 2 +-
 tools/bpf/bpftool/gen.c           | 2 +-
 tools/bpf/bpftool/map_perf_ring.c | 9 +++------
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 015d2758f826..223ac7676027 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -418,7 +418,7 @@ static int dump_btf_c(const struct btf *btf,
 	struct btf_dump *d;
 	int err = 0, i;
 
-	d = btf_dump__new(btf, NULL, NULL, btf_dump_printf);
+	d = btf_dump__new(btf, btf_dump_printf, NULL, NULL);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
 
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 5c18351290f0..89f0e828bbfa 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -218,7 +218,7 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name)
 	char sec_ident[256], map_ident[256];
 	int i, err = 0;
 
-	d = btf_dump__new(btf, NULL, NULL, codegen_btf_dump_printf);
+	d = btf_dump__new(btf, codegen_btf_dump_printf, NULL, NULL);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
 
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index b98ea702d284..6b0c410152de 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -124,7 +124,7 @@ int do_event_pipe(int argc, char **argv)
 		.wakeup_events = 1,
 	};
 	struct bpf_map_info map_info = {};
-	struct perf_buffer_raw_opts opts = {};
+	LIBBPF_OPTS(perf_buffer_raw_opts, opts);
 	struct event_pipe_ctx ctx = {
 		.all_cpus = true,
 		.cpu = -1,
@@ -190,14 +190,11 @@ int do_event_pipe(int argc, char **argv)
 		ctx.idx = 0;
 	}
 
-	opts.attr = &perf_attr;
-	opts.event_cb = print_bpf_output;
-	opts.ctx = &ctx;
 	opts.cpu_cnt = ctx.all_cpus ? 0 : 1;
 	opts.cpus = &ctx.cpu;
 	opts.map_keys = &ctx.idx;
-
-	pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &opts);
+	pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &perf_attr,
+				  print_bpf_output, &ctx, &opts);
 	err = libbpf_get_error(pb);
 	if (err) {
 		p_err("failed to create perf buffer: %s (%d)",
-- 
cgit v1.2.3


From 8c42d2fa4eeab6c37a0b1b1aa7a2715248ef4f34 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:09 -0800
Subject: bpf: Support BTF_KIND_TYPE_TAG for btf_type_tag attributes

LLVM patches ([1] for clang, [2] and [3] for BPF backend)
added support for btf_type_tag attributes. This patch
added support for the kernel.

The main motivation for btf_type_tag is to bring kernel
annotations __user, __rcu etc. to btf. With such information
available in btf, bpf verifier can detect mis-usages
and reject the program. For example, for __user tagged pointer,
developers can then use proper helper like bpf_probe_read_user()
etc. to read the data.

BTF_KIND_TYPE_TAG may also useful for other tracing
facility where instead of to require user to specify
kernel/user address type, the kernel can detect it
by itself with btf.

  [1] https://reviews.llvm.org/D111199
  [2] https://reviews.llvm.org/D113222
  [3] https://reviews.llvm.org/D113496

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012609.1505032-1-yhs@fb.com
---
 include/uapi/linux/btf.h       |  3 ++-
 kernel/bpf/btf.c               | 14 +++++++++++++-
 tools/include/uapi/linux/btf.h |  3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index deb12f755f0f..b0d8fea1951d 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -43,7 +43,7 @@ struct btf_type {
 	 * "size" tells the size of the type it is describing.
 	 *
 	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
-	 * FUNC, FUNC_PROTO, VAR and DECL_TAG.
+	 * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG.
 	 * "type" is a type_id referring to another type.
 	 */
 	union {
@@ -75,6 +75,7 @@ enum {
 	BTF_KIND_DATASEC	= 15,	/* Section	*/
 	BTF_KIND_FLOAT		= 16,	/* Floating point	*/
 	BTF_KIND_DECL_TAG	= 17,	/* Decl Tag */
+	BTF_KIND_TYPE_TAG	= 18,	/* Type Tag */
 
 	NR_BTF_KINDS,
 	BTF_KIND_MAX		= NR_BTF_KINDS - 1,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cdb0fba65600..1dd9ba82da1e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -282,6 +282,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_DATASEC]	= "DATASEC",
 	[BTF_KIND_FLOAT]	= "FLOAT",
 	[BTF_KIND_DECL_TAG]	= "DECL_TAG",
+	[BTF_KIND_TYPE_TAG]	= "TYPE_TAG",
 };
 
 const char *btf_type_str(const struct btf_type *t)
@@ -418,6 +419,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_CONST:
 	case BTF_KIND_RESTRICT:
+	case BTF_KIND_TYPE_TAG:
 		return true;
 	}
 
@@ -1737,6 +1739,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		case BTF_KIND_VOLATILE:
 		case BTF_KIND_CONST:
 		case BTF_KIND_RESTRICT:
+		case BTF_KIND_TYPE_TAG:
 			id = type->type;
 			type = btf_type_by_id(btf, type->type);
 			break;
@@ -2345,6 +2348,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 				   const struct btf_type *t,
 				   u32 meta_left)
 {
+	const char *value;
+
 	if (btf_type_vlen(t)) {
 		btf_verifier_log_type(env, t, "vlen != 0");
 		return -EINVAL;
@@ -2360,7 +2365,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* typedef type must have a valid name, and other ref types,
+	/* typedef/type_tag type must have a valid name, and other ref types,
 	 * volatile, const, restrict, should have a null name.
 	 */
 	if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -2369,6 +2374,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 			btf_verifier_log_type(env, t, "Invalid name");
 			return -EINVAL;
 		}
+	} else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+		value = btf_name_by_offset(env->btf, t->name_off);
+		if (!value || !value[0]) {
+			btf_verifier_log_type(env, t, "Invalid name");
+			return -EINVAL;
+		}
 	} else {
 		if (t->name_off) {
 			btf_verifier_log_type(env, t, "Invalid name");
@@ -4059,6 +4070,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
 	[BTF_KIND_DATASEC] = &datasec_ops,
 	[BTF_KIND_FLOAT] = &float_ops,
 	[BTF_KIND_DECL_TAG] = &decl_tag_ops,
+	[BTF_KIND_TYPE_TAG] = &modifier_ops,
 };
 
 static s32 btf_check_meta(struct btf_verifier_env *env,
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index deb12f755f0f..b0d8fea1951d 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -43,7 +43,7 @@ struct btf_type {
 	 * "size" tells the size of the type it is describing.
 	 *
 	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
-	 * FUNC, FUNC_PROTO, VAR and DECL_TAG.
+	 * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG.
 	 * "type" is a type_id referring to another type.
 	 */
 	union {
@@ -75,6 +75,7 @@ enum {
 	BTF_KIND_DATASEC	= 15,	/* Section	*/
 	BTF_KIND_FLOAT		= 16,	/* Floating point	*/
 	BTF_KIND_DECL_TAG	= 17,	/* Decl Tag */
+	BTF_KIND_TYPE_TAG	= 18,	/* Type Tag */
 
 	NR_BTF_KINDS,
 	BTF_KIND_MAX		= NR_BTF_KINDS - 1,
-- 
cgit v1.2.3


From 2dc1e488e5cdfd937554ca81fd46ad874d244b3f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:14 -0800
Subject: libbpf: Support BTF_KIND_TYPE_TAG

Add libbpf support for BTF_KIND_TYPE_TAG.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012614.1505315-1-yhs@fb.com
---
 tools/lib/bpf/btf.c             | 23 +++++++++++++++++++++++
 tools/lib/bpf/btf.h             |  9 ++++++++-
 tools/lib/bpf/btf_dump.c        |  9 +++++++++
 tools/lib/bpf/libbpf.c          | 31 ++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.map        |  1 +
 tools/lib/bpf/libbpf_internal.h |  2 ++
 6 files changed, 73 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index fcec27622e7a..fadf089ae8fe 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -299,6 +299,7 @@ static int btf_type_size(const struct btf_type *t)
 	case BTF_KIND_TYPEDEF:
 	case BTF_KIND_FUNC:
 	case BTF_KIND_FLOAT:
+	case BTF_KIND_TYPE_TAG:
 		return base_size;
 	case BTF_KIND_INT:
 		return base_size + sizeof(__u32);
@@ -349,6 +350,7 @@ static int btf_bswap_type_rest(struct btf_type *t)
 	case BTF_KIND_TYPEDEF:
 	case BTF_KIND_FUNC:
 	case BTF_KIND_FLOAT:
+	case BTF_KIND_TYPE_TAG:
 		return 0;
 	case BTF_KIND_INT:
 		*(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1));
@@ -649,6 +651,7 @@ int btf__align_of(const struct btf *btf, __u32 id)
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_CONST:
 	case BTF_KIND_RESTRICT:
+	case BTF_KIND_TYPE_TAG:
 		return btf__align_of(btf, t->type);
 	case BTF_KIND_ARRAY:
 		return btf__align_of(btf, btf_array(t)->type);
@@ -2235,6 +2238,22 @@ int btf__add_restrict(struct btf *btf, int ref_type_id)
 	return btf_add_ref_kind(btf, BTF_KIND_RESTRICT, NULL, ref_type_id);
 }
 
+/*
+ * Append new BTF_KIND_TYPE_TAG type with:
+ *   - *value*, non-empty/non-NULL tag value;
+ *   - *ref_type_id* - referenced type ID, it might not exist yet;
+ * Returns:
+ *   - >0, type ID of newly added BTF type;
+ *   - <0, on error.
+ */
+int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id)
+{
+	if (!value|| !value[0])
+		return libbpf_err(-EINVAL);
+
+	return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id);
+}
+
 /*
  * Append new BTF_KIND_FUNC type with:
  *   - *name*, non-empty/non-NULL name;
@@ -3639,6 +3658,7 @@ static int btf_dedup_prep(struct btf_dedup *d)
 		case BTF_KIND_TYPEDEF:
 		case BTF_KIND_FUNC:
 		case BTF_KIND_FLOAT:
+		case BTF_KIND_TYPE_TAG:
 			h = btf_hash_common(t);
 			break;
 		case BTF_KIND_INT:
@@ -3699,6 +3719,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id)
 	case BTF_KIND_VAR:
 	case BTF_KIND_DATASEC:
 	case BTF_KIND_DECL_TAG:
+	case BTF_KIND_TYPE_TAG:
 		return 0;
 
 	case BTF_KIND_INT:
@@ -4297,6 +4318,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
 	case BTF_KIND_PTR:
 	case BTF_KIND_TYPEDEF:
 	case BTF_KIND_FUNC:
+	case BTF_KIND_TYPE_TAG:
 		ref_type_id = btf_dedup_ref_type(d, t->type);
 		if (ref_type_id < 0)
 			return ref_type_id;
@@ -4603,6 +4625,7 @@ int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ct
 	case BTF_KIND_FUNC:
 	case BTF_KIND_VAR:
 	case BTF_KIND_DECL_TAG:
+	case BTF_KIND_TYPE_TAG:
 		return visit(&t->type, ctx);
 
 	case BTF_KIND_ARRAY: {
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 45310c65e865..5c73a5b0a044 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -227,6 +227,7 @@ LIBBPF_API int btf__add_typedef(struct btf *btf, const char *name, int ref_type_
 LIBBPF_API int btf__add_volatile(struct btf *btf, int ref_type_id);
 LIBBPF_API int btf__add_const(struct btf *btf, int ref_type_id);
 LIBBPF_API int btf__add_restrict(struct btf *btf, int ref_type_id);
+LIBBPF_API int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id);
 
 /* func and func_proto construction APIs */
 LIBBPF_API int btf__add_func(struct btf *btf, const char *name,
@@ -458,7 +459,8 @@ static inline bool btf_is_mod(const struct btf_type *t)
 
 	return kind == BTF_KIND_VOLATILE ||
 	       kind == BTF_KIND_CONST ||
-	       kind == BTF_KIND_RESTRICT;
+	       kind == BTF_KIND_RESTRICT ||
+	       kind == BTF_KIND_TYPE_TAG;
 }
 
 static inline bool btf_is_func(const struct btf_type *t)
@@ -491,6 +493,11 @@ static inline bool btf_is_decl_tag(const struct btf_type *t)
 	return btf_kind(t) == BTF_KIND_DECL_TAG;
 }
 
+static inline bool btf_is_type_tag(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_TYPE_TAG;
+}
+
 static inline __u8 btf_int_encoding(const struct btf_type *t)
 {
 	return BTF_INT_ENCODING(*(__u32 *)(t + 1));
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index b8cd7e4f557a..05f3e7dfec0a 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -330,6 +330,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d)
 		case BTF_KIND_FUNC:
 		case BTF_KIND_VAR:
 		case BTF_KIND_DECL_TAG:
+		case BTF_KIND_TYPE_TAG:
 			d->type_states[t->type].referenced = 1;
 			break;
 
@@ -573,6 +574,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_CONST:
 	case BTF_KIND_RESTRICT:
+	case BTF_KIND_TYPE_TAG:
 		return btf_dump_order_type(d, t->type, through_ptr);
 
 	case BTF_KIND_FUNC_PROTO: {
@@ -747,6 +749,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_CONST:
 	case BTF_KIND_RESTRICT:
+	case BTF_KIND_TYPE_TAG:
 		btf_dump_emit_type(d, t->type, cont_id);
 		break;
 	case BTF_KIND_ARRAY:
@@ -1167,6 +1170,7 @@ skip_mod:
 		case BTF_KIND_CONST:
 		case BTF_KIND_RESTRICT:
 		case BTF_KIND_FUNC_PROTO:
+		case BTF_KIND_TYPE_TAG:
 			id = t->type;
 			break;
 		case BTF_KIND_ARRAY:
@@ -1335,6 +1339,11 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 		case BTF_KIND_RESTRICT:
 			btf_dump_printf(d, " restrict");
 			break;
+		case BTF_KIND_TYPE_TAG:
+			btf_dump_emit_mods(d, decls);
+			name = btf_name_of(d, t->name_off);
+			btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name);
+			break;
 		case BTF_KIND_ARRAY: {
 			const struct btf_array *a = btf_array(t);
 			const struct btf_type *next_t;
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7db14859b27c..de7e09a6b5ec 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -197,6 +197,8 @@ enum kern_feature_id {
 	FEAT_PERF_LINK,
 	/* BTF_KIND_DECL_TAG support */
 	FEAT_BTF_DECL_TAG,
+	/* BTF_KIND_TYPE_TAG support */
+	FEAT_BTF_TYPE_TAG,
 	__FEAT_CNT,
 };
 
@@ -2076,6 +2078,7 @@ static const char *__btf_kind_str(__u16 kind)
 	case BTF_KIND_DATASEC: return "datasec";
 	case BTF_KIND_FLOAT: return "float";
 	case BTF_KIND_DECL_TAG: return "decl_tag";
+	case BTF_KIND_TYPE_TAG: return "type_tag";
 	default: return "unknown";
 	}
 }
@@ -2588,8 +2591,10 @@ static bool btf_needs_sanitization(struct bpf_object *obj)
 	bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT);
 	bool has_func = kernel_supports(obj, FEAT_BTF_FUNC);
 	bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG);
+	bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG);
 
-	return !has_func || !has_datasec || !has_func_global || !has_float || !has_decl_tag;
+	return !has_func || !has_datasec || !has_func_global || !has_float ||
+	       !has_decl_tag || !has_type_tag;
 }
 
 static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
@@ -2599,6 +2604,7 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
 	bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT);
 	bool has_func = kernel_supports(obj, FEAT_BTF_FUNC);
 	bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG);
+	bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG);
 	struct btf_type *t;
 	int i, j, vlen;
 
@@ -2657,6 +2663,10 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
 			 */
 			t->name_off = 0;
 			t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0);
+		} else if (!has_type_tag && btf_is_type_tag(t)) {
+			/* replace TYPE_TAG with a CONST */
+			t->name_off = 0;
+			t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0);
 		}
 	}
 }
@@ -4460,6 +4470,22 @@ static int probe_kern_btf_decl_tag(void)
 					     strs, sizeof(strs)));
 }
 
+static int probe_kern_btf_type_tag(void)
+{
+	static const char strs[] = "\0tag";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),		/* [1] */
+		/* attr */
+		BTF_TYPE_TYPE_TAG_ENC(1, 1),				/* [2] */
+		/* ptr */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),	/* [3] */
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
 static int probe_kern_array_mmap(void)
 {
 	struct bpf_create_map_attr attr = {
@@ -4657,6 +4683,9 @@ static struct kern_feature_desc {
 	[FEAT_BTF_DECL_TAG] = {
 		"BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag,
 	},
+	[FEAT_BTF_TYPE_TAG] = {
+		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
+	},
 };
 
 static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 9e91aa8da303..6a59514a48cf 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -403,6 +403,7 @@ LIBBPF_0.6.0 {
 		bpf_program__set_extra_flags;
 		btf__add_btf;
 		btf__add_decl_tag;
+		btf__add_type_tag;
 		btf__dedup;
 		btf__dedup_deprecated;
 		btf__raw_data;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index c1e34794b829..f7ac349650a1 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -73,6 +73,8 @@
 	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz)
 #define BTF_TYPE_DECL_TAG_ENC(value, type, component_idx) \
 	BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_DECL_TAG, 0, 0), type), (component_idx)
+#define BTF_TYPE_TYPE_TAG_ENC(value, type) \
+	BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_TYPE_TAG, 0, 0), type)
 
 #ifndef likely
 #define likely(x) __builtin_expect(!!(x), 1)
-- 
cgit v1.2.3


From 3da5ba6f0509ace03cad38b554c89797129e90be Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:20 -0800
Subject: bpftool: Support BTF_KIND_TYPE_TAG

Add bpftool support for BTF_KIND_TYPE_TAG.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012620.1505506-1-yhs@fb.com
---
 tools/bpf/bpftool/btf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 223ac7676027..c7e3b0b0029e 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -39,6 +39,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_DATASEC]	= "DATASEC",
 	[BTF_KIND_FLOAT]	= "FLOAT",
 	[BTF_KIND_DECL_TAG]	= "DECL_TAG",
+	[BTF_KIND_TYPE_TAG]	= "TYPE_TAG",
 };
 
 struct btf_attach_point {
@@ -142,6 +143,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_RESTRICT:
 	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_TYPE_TAG:
 		if (json_output)
 			jsonw_uint_field(w, "type_id", t->type);
 		else
-- 
cgit v1.2.3


From 0dc85872203bf7b15c56c7eb228b8f3fabb17ac2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:25 -0800
Subject: selftests/bpf: Test libbpf API function btf__add_type_tag()

Add unit tests for btf__add_type_tag().

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012625.1505748-1-yhs@fb.com
---
 tools/testing/selftests/bpf/btf_helpers.c          |  4 +-
 tools/testing/selftests/bpf/prog_tests/btf_write.c | 67 +++++++++++++---------
 2 files changed, 43 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c
index acb59202486d..b5941d514e17 100644
--- a/tools/testing/selftests/bpf/btf_helpers.c
+++ b/tools/testing/selftests/bpf/btf_helpers.c
@@ -25,11 +25,12 @@ static const char * const btf_kind_str_mapping[] = {
 	[BTF_KIND_DATASEC]	= "DATASEC",
 	[BTF_KIND_FLOAT]	= "FLOAT",
 	[BTF_KIND_DECL_TAG]	= "DECL_TAG",
+	[BTF_KIND_TYPE_TAG]	= "TYPE_TAG",
 };
 
 static const char *btf_kind_str(__u16 kind)
 {
-	if (kind > BTF_KIND_DECL_TAG)
+	if (kind > BTF_KIND_TYPE_TAG)
 		return "UNKNOWN";
 	return btf_kind_str_mapping[kind];
 }
@@ -109,6 +110,7 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id)
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_RESTRICT:
 	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_TYPE_TAG:
 		fprintf(out, " type_id=%u", t->type);
 		break;
 	case BTF_KIND_ARRAY: {
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c
index b912eeb0b6b4..addf99c05896 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_write.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c
@@ -297,6 +297,16 @@ static void gen_btf(struct btf *btf)
 	ASSERT_EQ(btf_decl_tag(t)->component_idx, 1, "tag_component_idx");
 	ASSERT_STREQ(btf_type_raw_dump(btf, 19),
 		     "[19] DECL_TAG 'tag2' type_id=14 component_idx=1", "raw_dump");
+
+	/* TYPE_TAG */
+	id = btf__add_type_tag(btf, "tag1", 1);
+	ASSERT_EQ(id, 20, "tag_id");
+	t = btf__type_by_id(btf, 20);
+	ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "tag1", "tag_value");
+	ASSERT_EQ(btf_kind(t), BTF_KIND_TYPE_TAG, "tag_kind");
+	ASSERT_EQ(t->type, 1, "tag_type");
+	ASSERT_STREQ(btf_type_raw_dump(btf, 20),
+		     "[20] TYPE_TAG 'tag1' type_id=1", "raw_dump");
 }
 
 static void test_btf_add()
@@ -337,7 +347,8 @@ static void test_btf_add()
 		"[17] DATASEC 'datasec1' size=12 vlen=1\n"
 		"\ttype_id=1 offset=4 size=8",
 		"[18] DECL_TAG 'tag1' type_id=16 component_idx=-1",
-		"[19] DECL_TAG 'tag2' type_id=14 component_idx=1");
+		"[19] DECL_TAG 'tag2' type_id=14 component_idx=1",
+		"[20] TYPE_TAG 'tag1' type_id=1");
 
 	btf__free(btf);
 }
@@ -359,7 +370,7 @@ static void test_btf_add_btf()
 	gen_btf(btf2);
 
 	id = btf__add_btf(btf1, btf2);
-	if (!ASSERT_EQ(id, 20, "id"))
+	if (!ASSERT_EQ(id, 21, "id"))
 		goto cleanup;
 
 	VALIDATE_RAW_BTF(
@@ -391,35 +402,37 @@ static void test_btf_add_btf()
 		"\ttype_id=1 offset=4 size=8",
 		"[18] DECL_TAG 'tag1' type_id=16 component_idx=-1",
 		"[19] DECL_TAG 'tag2' type_id=14 component_idx=1",
+		"[20] TYPE_TAG 'tag1' type_id=1",
 
 		/* types appended from the second BTF */
-		"[20] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
-		"[21] PTR '(anon)' type_id=20",
-		"[22] CONST '(anon)' type_id=24",
-		"[23] VOLATILE '(anon)' type_id=22",
-		"[24] RESTRICT '(anon)' type_id=23",
-		"[25] ARRAY '(anon)' type_id=21 index_type_id=20 nr_elems=10",
-		"[26] STRUCT 's1' size=8 vlen=2\n"
-		"\t'f1' type_id=20 bits_offset=0\n"
-		"\t'f2' type_id=20 bits_offset=32 bitfield_size=16",
-		"[27] UNION 'u1' size=8 vlen=1\n"
-		"\t'f1' type_id=20 bits_offset=0 bitfield_size=16",
-		"[28] ENUM 'e1' size=4 vlen=2\n"
+		"[21] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[22] PTR '(anon)' type_id=21",
+		"[23] CONST '(anon)' type_id=25",
+		"[24] VOLATILE '(anon)' type_id=23",
+		"[25] RESTRICT '(anon)' type_id=24",
+		"[26] ARRAY '(anon)' type_id=22 index_type_id=21 nr_elems=10",
+		"[27] STRUCT 's1' size=8 vlen=2\n"
+		"\t'f1' type_id=21 bits_offset=0\n"
+		"\t'f2' type_id=21 bits_offset=32 bitfield_size=16",
+		"[28] UNION 'u1' size=8 vlen=1\n"
+		"\t'f1' type_id=21 bits_offset=0 bitfield_size=16",
+		"[29] ENUM 'e1' size=4 vlen=2\n"
 		"\t'v1' val=1\n"
 		"\t'v2' val=2",
-		"[29] FWD 'struct_fwd' fwd_kind=struct",
-		"[30] FWD 'union_fwd' fwd_kind=union",
-		"[31] ENUM 'enum_fwd' size=4 vlen=0",
-		"[32] TYPEDEF 'typedef1' type_id=20",
-		"[33] FUNC 'func1' type_id=34 linkage=global",
-		"[34] FUNC_PROTO '(anon)' ret_type_id=20 vlen=2\n"
-		"\t'p1' type_id=20\n"
-		"\t'p2' type_id=21",
-		"[35] VAR 'var1' type_id=20, linkage=global-alloc",
-		"[36] DATASEC 'datasec1' size=12 vlen=1\n"
-		"\ttype_id=20 offset=4 size=8",
-		"[37] DECL_TAG 'tag1' type_id=35 component_idx=-1",
-		"[38] DECL_TAG 'tag2' type_id=33 component_idx=1");
+		"[30] FWD 'struct_fwd' fwd_kind=struct",
+		"[31] FWD 'union_fwd' fwd_kind=union",
+		"[32] ENUM 'enum_fwd' size=4 vlen=0",
+		"[33] TYPEDEF 'typedef1' type_id=21",
+		"[34] FUNC 'func1' type_id=35 linkage=global",
+		"[35] FUNC_PROTO '(anon)' ret_type_id=21 vlen=2\n"
+		"\t'p1' type_id=21\n"
+		"\t'p2' type_id=22",
+		"[36] VAR 'var1' type_id=21, linkage=global-alloc",
+		"[37] DATASEC 'datasec1' size=12 vlen=1\n"
+		"\ttype_id=21 offset=4 size=8",
+		"[38] DECL_TAG 'tag1' type_id=36 component_idx=-1",
+		"[39] DECL_TAG 'tag2' type_id=34 component_idx=1",
+		"[40] TYPE_TAG 'tag1' type_id=21");
 
 cleanup:
 	btf__free(btf1);
-- 
cgit v1.2.3


From 6aa5dabc9d0ef722905e4ca4f9751d70cf3ec8a4 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:30 -0800
Subject: selftests/bpf: Add BTF_KIND_TYPE_TAG unit tests

Add BTF_KIND_TYPE_TAG unit tests.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012630.1506095-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf.c | 18 ++++++++++++++++++
 tools/testing/selftests/bpf/test_btf.h       |  3 +++
 2 files changed, 21 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 1e8b36d74df2..88510a2d9858 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -3939,6 +3939,23 @@ static struct btf_raw_test raw_tests[] = {
 	.btf_load_err = true,
 	.err_str = "Invalid component_idx",
 },
+{
+	.descr = "type_tag test #1",
+	.raw_types = {
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		BTF_TYPE_TAG_ENC(NAME_TBD, 1),			/* [2] */
+		BTF_PTR_ENC(2),					/* [3] */
+		BTF_END_RAW,
+	},
+	BTF_STR_SEC("\0tag"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "tag_type_check_btf",
+	.key_size = sizeof(int),
+	.value_size = 4,
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 1,
+},
 
 }; /* struct btf_raw_test raw_tests[] */
 
@@ -7222,6 +7239,7 @@ static int btf_type_size(const struct btf_type *t)
 	case BTF_KIND_TYPEDEF:
 	case BTF_KIND_FUNC:
 	case BTF_KIND_FLOAT:
+	case BTF_KIND_TYPE_TAG:
 		return base_size;
 	case BTF_KIND_INT:
 		return base_size + sizeof(__u32);
diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h
index 32c7a57867da..128989bed8b7 100644
--- a/tools/testing/selftests/bpf/test_btf.h
+++ b/tools/testing/selftests/bpf/test_btf.h
@@ -72,4 +72,7 @@
 #define BTF_DECL_TAG_ENC(value, type, component_idx)	\
 	BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_DECL_TAG, 0, 0), type), (component_idx)
 
+#define BTF_TYPE_TAG_ENC(value, type)	\
+	BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_TYPE_TAG, 0, 0), type)
+
 #endif /* _TEST_BTF_H */
-- 
cgit v1.2.3


From 846f4826d18e660ab668eb26e83c6adf0ceb24d2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:35 -0800
Subject: selftests/bpf: Test BTF_KIND_DECL_TAG for deduplication

Add BTF_KIND_TYPE_TAG duplication unit tests.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012635.1506853-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf.c | 139 ++++++++++++++++++++++++++-
 1 file changed, 135 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 88510a2d9858..4aa6343dc4c8 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -6878,15 +6878,16 @@ static struct btf_dedup_test dedup_tests[] = {
 			BTF_RESTRICT_ENC(8),						/* [11] restrict */
 			BTF_FUNC_PROTO_ENC(1, 2),					/* [12] func_proto */
 				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
-				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 18),
 			BTF_FUNC_ENC(NAME_TBD, 12),					/* [13] func */
 			BTF_TYPE_FLOAT_ENC(NAME_TBD, 2),				/* [14] float */
 			BTF_DECL_TAG_ENC(NAME_TBD, 13, -1),				/* [15] decl_tag */
 			BTF_DECL_TAG_ENC(NAME_TBD, 13, 1),				/* [16] decl_tag */
 			BTF_DECL_TAG_ENC(NAME_TBD, 7, -1),				/* [17] decl_tag */
+			BTF_TYPE_TAG_ENC(NAME_TBD, 8),					/* [18] type_tag */
 			BTF_END_RAW,
 		},
-		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q"),
+		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R"),
 	},
 	.expect = {
 		.raw_types = {
@@ -6907,15 +6908,16 @@ static struct btf_dedup_test dedup_tests[] = {
 			BTF_RESTRICT_ENC(8),						/* [11] restrict */
 			BTF_FUNC_PROTO_ENC(1, 2),					/* [12] func_proto */
 				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
-				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 18),
 			BTF_FUNC_ENC(NAME_TBD, 12),					/* [13] func */
 			BTF_TYPE_FLOAT_ENC(NAME_TBD, 2),				/* [14] float */
 			BTF_DECL_TAG_ENC(NAME_TBD, 13, -1),				/* [15] decl_tag */
 			BTF_DECL_TAG_ENC(NAME_TBD, 13, 1),				/* [16] decl_tag */
 			BTF_DECL_TAG_ENC(NAME_TBD, 7, -1),				/* [17] decl_tag */
+			BTF_TYPE_TAG_ENC(NAME_TBD, 8),					/* [18] type_tag */
 			BTF_END_RAW,
 		},
-		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q"),
+		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N\0O\0P\0Q\0R"),
 	},
 },
 {
@@ -7221,6 +7223,135 @@ static struct btf_dedup_test dedup_tests[] = {
 		BTF_STR_SEC("\0t\0tag1\0tag2\0tag3"),
 	},
 },
+{
+	.descr = "dedup: btf_type_tag #1",
+	.input = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [5] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 5),		/* [6] */
+			BTF_PTR_ENC(6),					/* [7] */
+			/* ptr -> tag1 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [8] */
+			BTF_PTR_ENC(8),					/* [9] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+	.expect = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag1 -> int */
+			BTF_PTR_ENC(2),					/* [5] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+},
+{
+	.descr = "dedup: btf_type_tag #2",
+	.input = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag2 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 1),		/* [5] */
+			BTF_PTR_ENC(5),					/* [6] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+	.expect = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag2 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 1),		/* [5] */
+			BTF_PTR_ENC(5),					/* [6] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+},
+{
+	.descr = "dedup: btf_type_tag #3",
+	.input = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag1 -> tag2 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 1),		/* [5] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 5),		/* [6] */
+			BTF_PTR_ENC(6),					/* [7] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+	.expect = {
+		.raw_types = {
+			/* ptr -> tag2 -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 2),		/* [3] */
+			BTF_PTR_ENC(3),					/* [4] */
+			/* ptr -> tag1 -> tag2 -> int */
+			BTF_TYPE_TAG_ENC(NAME_NTH(2), 1),		/* [5] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 5),		/* [6] */
+			BTF_PTR_ENC(6),					/* [7] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0tag2"),
+	},
+},
+{
+	.descr = "dedup: btf_type_tag #4",
+	.input = {
+		.raw_types = {
+			/* ptr -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_PTR_ENC(2),					/* [3] */
+			/* ptr -> tag1 -> long */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8),	/* [4] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 4),		/* [5] */
+			BTF_PTR_ENC(5),					/* [6] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1"),
+	},
+	.expect = {
+		.raw_types = {
+			/* ptr -> tag1 -> int */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),		/* [2] */
+			BTF_PTR_ENC(2),					/* [3] */
+			/* ptr -> tag1 -> long */
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8),	/* [4] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 4),		/* [5] */
+			BTF_PTR_ENC(5),					/* [6] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1"),
+	},
+},
 
 };
 
-- 
cgit v1.2.3


From 26c79fcbfa64b18ca1407a3be7ac3442aef51073 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:41 -0800
Subject: selftests/bpf: Rename progs/tag.c to progs/btf_decl_tag.c

Rename progs/tag.c to progs/btf_decl_tag.c so we can introduce
progs/btf_type_tag.c in the next patch.

Also create a subtest for btf_decl_tag in prog_tests/btf_tag.c
so we can introduce btf_type_tag subtest in the next patch.

I also took opportunity to remove the check whether __has_attribute
is defined or not in progs/btf_decl_tag.c since all recent
clangs should already support this macro.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012641.1507144-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf_tag.c | 20 ++++++---
 tools/testing/selftests/bpf/progs/btf_decl_tag.c | 50 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/tag.c          | 54 ------------------------
 3 files changed, 63 insertions(+), 61 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/btf_decl_tag.c
 delete mode 100644 tools/testing/selftests/bpf/progs/tag.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_tag.c b/tools/testing/selftests/bpf/prog_tests/btf_tag.c
index 91821f42714d..d15cc7a88182 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_tag.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_tag.c
@@ -1,20 +1,26 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
 #include <test_progs.h>
-#include "tag.skel.h"
+#include "btf_decl_tag.skel.h"
 
-void test_btf_tag(void)
+static void test_btf_decl_tag(void)
 {
-	struct tag *skel;
+	struct btf_decl_tag *skel;
 
-	skel = tag__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "btf_tag"))
+	skel = btf_decl_tag__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "btf_decl_tag"))
 		return;
 
 	if (skel->rodata->skip_tests) {
-		printf("%s:SKIP: btf_tag attribute not supported", __func__);
+		printf("%s:SKIP: btf_decl_tag attribute not supported", __func__);
 		test__skip();
 	}
 
-	tag__destroy(skel);
+	btf_decl_tag__destroy(skel);
+}
+
+void test_btf_tag(void)
+{
+	if (test__start_subtest("btf_decl_tag"))
+		test_btf_decl_tag();
 }
diff --git a/tools/testing/selftests/bpf/progs/btf_decl_tag.c b/tools/testing/selftests/bpf/progs/btf_decl_tag.c
new file mode 100644
index 000000000000..c88ccc53529a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_decl_tag.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#if __has_attribute(btf_decl_tag)
+#define __tag1 __attribute__((btf_decl_tag("tag1")))
+#define __tag2 __attribute__((btf_decl_tag("tag2")))
+volatile const bool skip_tests __tag1 __tag2 = false;
+#else
+#define __tag1
+#define __tag2
+volatile const bool skip_tests = true;
+#endif
+
+struct key_t {
+	int a;
+	int b __tag1 __tag2;
+	int c;
+} __tag1 __tag2;
+
+typedef struct {
+	int a;
+	int b;
+} value_t __tag1 __tag2;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 3);
+	__type(key, struct key_t);
+	__type(value, value_t);
+} hashmap1 SEC(".maps");
+
+
+static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2
+{
+	struct key_t key;
+	value_t val = {};
+
+	key.a = key.b = key.c = x;
+	bpf_map_update_elem(&hashmap1, &key, &val, 0);
+	return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(sub, int x)
+{
+	return foo(x);
+}
diff --git a/tools/testing/selftests/bpf/progs/tag.c b/tools/testing/selftests/bpf/progs/tag.c
deleted file mode 100644
index 1792f4eda095..000000000000
--- a/tools/testing/selftests/bpf/progs/tag.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2021 Facebook */
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-#ifndef __has_attribute
-#define __has_attribute(x) 0
-#endif
-
-#if __has_attribute(btf_decl_tag)
-#define __tag1 __attribute__((btf_decl_tag("tag1")))
-#define __tag2 __attribute__((btf_decl_tag("tag2")))
-volatile const bool skip_tests __tag1 __tag2 = false;
-#else
-#define __tag1
-#define __tag2
-volatile const bool skip_tests = true;
-#endif
-
-struct key_t {
-	int a;
-	int b __tag1 __tag2;
-	int c;
-} __tag1 __tag2;
-
-typedef struct {
-	int a;
-	int b;
-} value_t __tag1 __tag2;
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__uint(max_entries, 3);
-	__type(key, struct key_t);
-	__type(value, value_t);
-} hashmap1 SEC(".maps");
-
-
-static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2
-{
-	struct key_t key;
-	value_t val = {};
-
-	key.a = key.b = key.c = x;
-	bpf_map_update_elem(&hashmap1, &key, &val, 0);
-	return 0;
-}
-
-SEC("fentry/bpf_fentry_test1")
-int BPF_PROG(sub, int x)
-{
-	return foo(x);
-}
-- 
cgit v1.2.3


From 5698a42a73a1d9cb7efd31ca1bf35daa87f5e1a9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:46 -0800
Subject: selftests/bpf: Add a C test for btf_type_tag

The following is the main btf_type_tag usage in the
C test:
  #define __tag1 __attribute__((btf_type_tag("tag1")))
  #define __tag2 __attribute__((btf_type_tag("tag2")))
  struct btf_type_tag_test {
       int __tag1 * __tag1 __tag2 *p;
  } g;

The bpftool raw dump with related types:
  [4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
  [11] STRUCT 'btf_type_tag_test' size=8 vlen=1
          'p' type_id=14 bits_offset=0
  [12] TYPE_TAG 'tag1' type_id=16
  [13] TYPE_TAG 'tag2' type_id=12
  [14] PTR '(anon)' type_id=13
  [15] TYPE_TAG 'tag1' type_id=4
  [16] PTR '(anon)' type_id=15
  [17] VAR 'g' type_id=11, linkage=global

With format C dump, we have
  struct btf_type_tag_test {
        int __attribute__((btf_type_tag("tag1"))) * __attribute__((btf_type_tag("tag1"))) __attribute__((btf_type_tag("tag2"))) *p;
  };
The result C code is identical to the original definition except macro's are gone.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012646.1508231-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf_tag.c | 24 +++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/btf_type_tag.c | 25 ++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/btf_type_tag.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_tag.c b/tools/testing/selftests/bpf/prog_tests/btf_tag.c
index d15cc7a88182..88d63e23e35f 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_tag.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_tag.c
@@ -3,6 +3,12 @@
 #include <test_progs.h>
 #include "btf_decl_tag.skel.h"
 
+/* struct btf_type_tag_test is referenced in btf_type_tag.skel.h */
+struct btf_type_tag_test {
+        int **p;
+};
+#include "btf_type_tag.skel.h"
+
 static void test_btf_decl_tag(void)
 {
 	struct btf_decl_tag *skel;
@@ -19,8 +25,26 @@ static void test_btf_decl_tag(void)
 	btf_decl_tag__destroy(skel);
 }
 
+static void test_btf_type_tag(void)
+{
+	struct btf_type_tag *skel;
+
+	skel = btf_type_tag__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "btf_type_tag"))
+		return;
+
+	if (skel->rodata->skip_tests) {
+		printf("%s:SKIP: btf_type_tag attribute not supported", __func__);
+		test__skip();
+	}
+
+	btf_type_tag__destroy(skel);
+}
+
 void test_btf_tag(void)
 {
 	if (test__start_subtest("btf_decl_tag"))
 		test_btf_decl_tag();
+	if (test__start_subtest("btf_type_tag"))
+		test_btf_type_tag();
 }
diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag.c b/tools/testing/selftests/bpf/progs/btf_type_tag.c
new file mode 100644
index 000000000000..1d488da7e920
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_type_tag.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#if __has_attribute(btf_type_tag)
+#define __tag1 __attribute__((btf_type_tag("tag1")))
+#define __tag2 __attribute__((btf_type_tag("tag2")))
+volatile const bool skip_tests = false;
+#else
+#define __tag1
+#define __tag2
+volatile const bool skip_tests = true;
+#endif
+
+struct btf_type_tag_test {
+	int __tag1 * __tag1 __tag2 *p;
+} g;
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(sub, int x)
+{
+  return 0;
+}
-- 
cgit v1.2.3


From 3f1d0dc0ba290aab357083a0abfe267c8cffdc8d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Nov 2021 17:26:51 -0800
Subject: selftests/bpf: Clarify llvm dependency with btf_tag selftest

btf_tag selftest needs certain llvm versions (>= llvm14).
Make it clear in the selftests README.rst file.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112012651.1508549-1-yhs@fb.com
---
 tools/testing/selftests/bpf/README.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 5e287e445f75..42ef250c7acc 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -204,16 +204,17 @@ __ https://reviews.llvm.org/D93563
 btf_tag test and Clang version
 ==============================
 
-The btf_tag selftest require LLVM support to recognize the btf_decl_tag attribute.
-It was introduced in `Clang 14`__.
+The btf_tag selftest requires LLVM support to recognize the btf_decl_tag and
+btf_type_tag attributes. They are introduced in `Clang 14` [0_, 1_].
 
-Without it, the btf_tag selftest will be skipped and you will observe:
+Without them, the btf_tag selftest will be skipped and you will observe:
 
 .. code-block:: console
 
   #<test_num> btf_tag:SKIP
 
-__ https://reviews.llvm.org/D111588
+.. _0: https://reviews.llvm.org/D111588
+.. _1: https://reviews.llvm.org/D111199
 
 Clang dependencies for static linking tests
 ===========================================
-- 
cgit v1.2.3


From 314f14abdeca78de6b16f97d796a9966ce4b90ae Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 10 Nov 2021 11:23:24 -0800
Subject: bpftool: Enable libbpf's strict mode by default

Otherwise, attaching with bpftool doesn't work with strict section names.

Also:

  - Add --legacy option to switch back to pre-1.0 behavior
  - Print a warning when program fails to load in strict mode to
    point to --legacy flag
  - By default, don't append / to the section name; in strict
    mode it's relevant only for a small subset of prog types

+ bpftool --legacy prog loadall tools/testing/selftests/bpf/test_cgroup_link.o /sys/fs/bpf/kprobe type kprobe
libbpf: failed to pin program: File exists
Error: failed to pin all programs
+ bpftool prog loadall tools/testing/selftests/bpf/test_cgroup_link.o /sys/fs/bpf/kprobe type kprobe

v1 -> v2:
  - strict by default (Quentin Monnet)
  - add more info to --legacy description (Quentin Monnet)
  - add bash completion (Quentin Monnet)

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211110192324.920934-1-sdf@google.com
---
 tools/bpf/bpftool/Documentation/common_options.rst |  9 +++++
 tools/bpf/bpftool/bash-completion/bpftool          |  2 +-
 tools/bpf/bpftool/main.c                           | 13 ++++++-
 tools/bpf/bpftool/main.h                           |  3 +-
 tools/bpf/bpftool/prog.c                           | 40 +++++++++++++---------
 5 files changed, 48 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst
index 05d06c74dcaa..75adf23202d8 100644
--- a/tools/bpf/bpftool/Documentation/common_options.rst
+++ b/tools/bpf/bpftool/Documentation/common_options.rst
@@ -20,3 +20,12 @@
 	  Print all logs available, even debug-level information. This includes
 	  logs from libbpf as well as from the verifier, when attempting to
 	  load programs.
+
+-l, --legacy
+	  Use legacy libbpf mode which has more relaxed BPF program
+	  requirements. By default, bpftool has more strict requirements
+	  about section names, changes pinning logic and doesn't support
+	  some of the older non-BTF map declarations.
+
+	  See https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0
+	  for details.
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 88e2bcf16cca..4a1b02ff72c1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -261,7 +261,7 @@ _bpftool()
     # Deal with options
     if [[ ${words[cword]} == -* ]]; then
         local c='--version --json --pretty --bpffs --mapcompat --debug \
-	       --use-loader --base-btf'
+	       --use-loader --base-btf --legacy'
         COMPREPLY=( $( compgen -W "$c" -- "$cur" ) )
         return 0
     fi
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 28237d7cef67..473791e87f7d 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -31,6 +31,7 @@ bool block_mount;
 bool verifier_logs;
 bool relaxed_maps;
 bool use_loader;
+bool legacy_libbpf;
 struct btf *base_btf;
 struct hashmap *refs_table;
 
@@ -396,6 +397,7 @@ int main(int argc, char **argv)
 		{ "debug",	no_argument,	NULL,	'd' },
 		{ "use-loader",	no_argument,	NULL,	'L' },
 		{ "base-btf",	required_argument, NULL, 'B' },
+		{ "legacy",	no_argument,	NULL,	'l' },
 		{ 0 }
 	};
 	int opt, ret;
@@ -408,7 +410,7 @@ int main(int argc, char **argv)
 	bin_name = argv[0];
 
 	opterr = 0;
-	while ((opt = getopt_long(argc, argv, "VhpjfLmndB:",
+	while ((opt = getopt_long(argc, argv, "VhpjfLmndB:l",
 				  options, NULL)) >= 0) {
 		switch (opt) {
 		case 'V':
@@ -454,6 +456,9 @@ int main(int argc, char **argv)
 		case 'L':
 			use_loader = true;
 			break;
+		case 'l':
+			legacy_libbpf = true;
+			break;
 		default:
 			p_err("unrecognized option '%s'", argv[optind - 1]);
 			if (json_output)
@@ -463,6 +468,12 @@ int main(int argc, char **argv)
 		}
 	}
 
+	if (!legacy_libbpf) {
+		ret = libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+		if (ret)
+			p_err("failed to enable libbpf strict mode: %d", ret);
+	}
+
 	argc -= optind;
 	argv += optind;
 	if (argc < 0)
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 383835c2604d..8d76d937a62b 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -57,7 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr)
 #define HELP_SPEC_PROGRAM						\
 	"PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }"
 #define HELP_SPEC_OPTIONS						\
-	"OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug}"
+	"OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy}"
 #define HELP_SPEC_MAP							\
 	"MAP := { id MAP_ID | pinned FILE | name MAP_NAME }"
 #define HELP_SPEC_LINK							\
@@ -90,6 +90,7 @@ extern bool block_mount;
 extern bool verifier_logs;
 extern bool relaxed_maps;
 extern bool use_loader;
+extern bool legacy_libbpf;
 extern struct btf *base_btf;
 extern struct hashmap *refs_table;
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index dea7a49ec26e..bf85c914f2fa 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1483,8 +1483,6 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 
 	while (argc) {
 		if (is_prefix(*argv, "type")) {
-			char *type;
-
 			NEXT_ARG();
 
 			if (common_prog_type != BPF_PROG_TYPE_UNSPEC) {
@@ -1494,21 +1492,26 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 			if (!REQ_ARGS(1))
 				goto err_free_reuse_maps;
 
-			/* Put a '/' at the end of type to appease libbpf */
-			type = malloc(strlen(*argv) + 2);
-			if (!type) {
-				p_err("mem alloc failed");
-				goto err_free_reuse_maps;
-			}
-			*type = 0;
-			strcat(type, *argv);
-			strcat(type, "/");
+			err = libbpf_prog_type_by_name(*argv, &common_prog_type,
+						       &expected_attach_type);
+			if (err < 0) {
+				/* Put a '/' at the end of type to appease libbpf */
+				char *type = malloc(strlen(*argv) + 2);
 
-			err = get_prog_type_by_name(type, &common_prog_type,
-						    &expected_attach_type);
-			free(type);
-			if (err < 0)
-				goto err_free_reuse_maps;
+				if (!type) {
+					p_err("mem alloc failed");
+					goto err_free_reuse_maps;
+				}
+				*type = 0;
+				strcat(type, *argv);
+				strcat(type, "/");
+
+				err = get_prog_type_by_name(type, &common_prog_type,
+							    &expected_attach_type);
+				free(type);
+				if (err < 0)
+					goto err_free_reuse_maps;
+			}
 
 			NEXT_ARG();
 		} else if (is_prefix(*argv, "map")) {
@@ -1731,6 +1734,11 @@ err_unpin:
 	else
 		bpf_object__unpin_programs(obj, pinfile);
 err_close_obj:
+	if (!legacy_libbpf) {
+		p_info("Warning: bpftool is now running in libbpf strict mode and has more stringent requirements about BPF programs.\n"
+		       "If it used to work for this object file but now doesn't, see --legacy option for more details.\n");
+	}
+
 	bpf_object__close(obj);
 err_free_reuse_maps:
 	for (i = 0; i < old_map_fds; i++)
-- 
cgit v1.2.3


From 21c6ec3d5275a77348b1af0e78cbbed0ee1558d4 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 12 Nov 2021 12:48:33 -0800
Subject: selftests/bpf: Fix an unused-but-set-variable compiler warning

When using clang to build selftests with LLVM=1 in make commandline,
I hit the following compiler warning:
  xdpxceiver.c:747:6: warning: variable 'total' set but not used [-Wunused-but-set-variable]
          u32 total = 0;
              ^

This patch fixed the issue by removing that declaration and its
assocatied unused operation.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211112204833.3579457-1-yhs@fb.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 6c7cf8aadc79..fe7f423b8c3f 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -744,7 +744,6 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 	struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream);
 	struct xsk_umem_info *umem = xsk->umem;
 	u32 idx_rx = 0, idx_fq = 0, rcvd, i;
-	u32 total = 0;
 	int ret;
 
 	while (pkt) {
@@ -799,7 +798,6 @@ static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *
 
 		pthread_mutex_lock(&pacing_mutex);
 		pkts_in_flight -= rcvd;
-		total += rcvd;
 		if (pkts_in_flight < umem->num_frames)
 			pthread_cond_signal(&pacing_cond);
 		pthread_mutex_unlock(&pacing_mutex);
-- 
cgit v1.2.3


From 325d956d67178af92b5b12ff950a2f93a433f2c4 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 12 Nov 2021 12:48:38 -0800
Subject: selftests/bpf: Fix a tautological-constant-out-of-range-compare
 compiler warning

When using clang to build selftests with LLVM=1 in make commandline,
I hit the following compiler warning:

  benchs/bench_bloom_filter_map.c:84:46: warning: result of comparison of constant 256
    with expression of type '__u8' (aka 'unsigned char') is always false
    [-Wtautological-constant-out-of-range-compare]
                if (args.value_size < 2 || args.value_size > 256) {
                                           ~~~~~~~~~~~~~~~ ^ ~~~

The reason is arg.vaue_size has type __u8, so comparison "args.value_size > 256"
is always false.

This patch fixed the issue by doing proper comparison before assigning the
value to args.value_size. The patch also fixed the same issue in two
other places.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211112204838.3579953-1-yhs@fb.com
---
 .../selftests/bpf/benchs/bench_bloom_filter_map.c       | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
index 6eeeed2913e6..5bcb8a8cdeb2 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
@@ -63,29 +63,34 @@ static const struct argp_option opts[] = {
 
 static error_t parse_arg(int key, char *arg, struct argp_state *state)
 {
+	long ret;
+
 	switch (key) {
 	case ARG_NR_ENTRIES:
-		args.nr_entries = strtol(arg, NULL, 10);
-		if (args.nr_entries == 0) {
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
 			fprintf(stderr, "Invalid nr_entries count.");
 			argp_usage(state);
 		}
+		args.nr_entries = ret;
 		break;
 	case ARG_NR_HASH_FUNCS:
-		args.nr_hash_funcs = strtol(arg, NULL, 10);
-		if (args.nr_hash_funcs == 0 || args.nr_hash_funcs > 15) {
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > 15) {
 			fprintf(stderr,
 				"The bloom filter must use 1 to 15 hash functions.");
 			argp_usage(state);
 		}
+		args.nr_hash_funcs = ret;
 		break;
 	case ARG_VALUE_SIZE:
-		args.value_size = strtol(arg, NULL, 10);
-		if (args.value_size < 2 || args.value_size > 256) {
+		ret = strtol(arg, NULL, 10);
+		if (ret < 2 || ret > 256) {
 			fprintf(stderr,
 				"Invalid value size. Must be between 2 and 256 bytes");
 			argp_usage(state);
 		}
+		args.value_size = ret;
 		break;
 	default:
 		return ARGP_ERR_UNKNOWN;
-- 
cgit v1.2.3


From ebbd7f64a3fbe9e0f235e39fc244ee9735e2a52a Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Wed, 10 Nov 2021 11:46:27 +0000
Subject: bpftool: Fix memory leak in prog_dump()

Following the extraction of prog_dump() from do_dump(), the struct btf
allocated in prog_dump() is no longer freed on error; the struct
bpf_prog_linfo is not freed at all. Make sure we release them before
exiting the function.

Fixes: ec2025095cf6 ("bpftool: Match several programs with same tag")
Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211110114632.24537-2-quentin@isovalent.com
---
 tools/bpf/bpftool/prog.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index bf85c914f2fa..e47e8b06cc3d 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -709,8 +709,8 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 	char func_sig[1024];
 	unsigned char *buf;
 	__u32 member_len;
+	int fd, err = -1;
 	ssize_t n;
-	int fd;
 
 	if (mode == DUMP_JITED) {
 		if (info->jited_prog_len == 0 || !info->jited_prog_insns) {
@@ -749,7 +749,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 		if (fd < 0) {
 			p_err("can't open file %s: %s", filepath,
 			      strerror(errno));
-			return -1;
+			goto exit_free;
 		}
 
 		n = write(fd, buf, member_len);
@@ -757,7 +757,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 		if (n != (ssize_t)member_len) {
 			p_err("error writing output file: %s",
 			      n < 0 ? strerror(errno) : "short write");
-			return -1;
+			goto exit_free;
 		}
 
 		if (json_output)
@@ -771,7 +771,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 						     info->netns_ino,
 						     &disasm_opt);
 			if (!name)
-				return -1;
+				goto exit_free;
 		}
 
 		if (info->nr_jited_func_lens && info->jited_func_lens) {
@@ -866,9 +866,12 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 		kernel_syms_destroy(&dd);
 	}
 
-	btf__free(btf);
+	err = 0;
 
-	return 0;
+exit_free:
+	btf__free(btf);
+	bpf_prog_linfo__free(prog_linfo);
+	return err;
 }
 
 static int do_dump(int argc, char **argv)
-- 
cgit v1.2.3


From 48f5aef4c458c19ab337eed8c95a6486cc014aa3 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Wed, 10 Nov 2021 11:46:28 +0000
Subject: bpftool: Remove inclusion of utilities.mak from Makefiles

Bpftool's Makefile, and the Makefile for its documentation, both include
scripts/utilities.mak, but they use none of the items defined in this
file. Remove the includes.

Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool")
Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211110114632.24537-3-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/Makefile | 1 -
 tools/bpf/bpftool/Makefile               | 1 -
 2 files changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index 44b60784847b..692e1b947490 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 include ../../../scripts/Makefile.include
-include ../../../scripts/utilities.mak
 
 INSTALL ?= install
 RM ?= rm -f
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 31dfef6a4121..1b36c11a2a26 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 include ../../scripts/Makefile.include
-include ../../scripts/utilities.mak
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
-- 
cgit v1.2.3


From 986dec18bbf41f50edc2e0aa4ac5ef8e0f64f328 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Wed, 10 Nov 2021 11:46:30 +0000
Subject: bpftool: Fix indent in option lists in the documentation

Mixed indentation levels in the lists of options in bpftool's
documentation produces some unexpected results. For the "bpftool" man
page, it prints a warning:

    $ make -C bpftool.8
      GEN     bpftool.8
    <stdin>:26: (ERROR/3) Unexpected indentation.

For other pages, there is no warning, but it results in a line break
appearing in the option lists in the generated man pages.

RST paragraphs should have a uniform indentation level. Let's fix it.

Fixes: c07ba629df97 ("tools: bpftool: Update and synchronise option list in doc and help msg")
Fixes: 8cc8c6357c8f ("tools: bpftool: Document and add bash completion for -L, -B options")
Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211110114632.24537-5-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/bpftool-btf.rst    | 2 +-
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 2 +-
 tools/bpf/bpftool/Documentation/bpftool-gen.rst    | 2 +-
 tools/bpf/bpftool/Documentation/bpftool-link.rst   | 2 +-
 tools/bpf/bpftool/Documentation/bpftool-map.rst    | 6 +++---
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   | 8 ++++----
 tools/bpf/bpftool/Documentation/bpftool.rst        | 6 +++---
 7 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 88b28aa7431f..4425d942dd39 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -13,7 +13,7 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **btf** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } |
-		{ **-B** | **--base-btf** } }
+	{ **-B** | **--base-btf** } }
 
 	*COMMANDS* := { **dump** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 3e4395eede4f..13a217a2503d 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -13,7 +13,7 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **cgroup** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-		{ **-f** | **--bpffs** } }
+	{ **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **tree** | **attach** | **detach** | **help** }
diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
index 2ef2f2df0279..2a137f8a4cea 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
@@ -13,7 +13,7 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **gen** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-		{ **-L** | **--use-loader** } }
+	{ **-L** | **--use-loader** } }
 
 	*COMMAND* := { **object** | **skeleton** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst
index 0de90f086238..9434349636a5 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-link.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst
@@ -13,7 +13,7 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **link** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-		{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+	{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
 	*COMMANDS* := { **show** | **list** | **pin** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index d0c4abe08aba..1445cadc15d4 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -13,11 +13,11 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **map** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-		{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+	{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
 	*COMMANDS* :=
-	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext**
-	| **delete** | **pin** | **help** }
+	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
+	**delete** | **pin** | **help** }
 
 MAP COMMANDS
 =============
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 91608cb7e44a..f27265bd589b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -13,12 +13,12 @@ SYNOPSIS
 	**bpftool** [*OPTIONS*] **prog** *COMMAND*
 
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-		{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
-		{ **-L** | **--use-loader** } }
+	{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
+	{ **-L** | **--use-loader** } }
 
 	*COMMANDS* :=
-	{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load**
-	| **loadall** | **help** }
+	{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** |
+	**loadall** | **help** }
 
 PROG COMMANDS
 =============
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index bb23f55bb05a..8ac86565c501 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -19,14 +19,14 @@ SYNOPSIS
 	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** }
 
 	*OPTIONS* := { { **-V** | **--version** } |
-		{ **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	{ **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
 
 	*MAP-COMMANDS* :=
 	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
-		**delete** | **pin** | **event_pipe** | **help** }
+	**delete** | **pin** | **event_pipe** | **help** }
 
 	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** |
-		**load** | **attach** | **detach** | **help** }
+	**load** | **attach** | **detach** | **help** }
 
 	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
-- 
cgit v1.2.3


From 3811e2753a39efb8aa5b8c133dc24f6d26f6cd96 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Wed, 10 Nov 2021 11:46:31 +0000
Subject: bpftool: Update the lists of names for maps and prog-attach types

To support the different BPF map or attach types, bpftool must remain
up-to-date with the types supported by the kernel. Let's update the
lists, by adding the missing Bloom filter map type and the perf_event
attach type.

Both missing items were found with test_bpftool_synctypes.py.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211110114632.24537-6-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +-
 tools/bpf/bpftool/bash-completion/bpftool       | 3 ++-
 tools/bpf/bpftool/common.c                      | 1 +
 tools/bpf/bpftool/map.c                         | 3 ++-
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 1445cadc15d4..991d18fd84f2 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -52,7 +52,7 @@ MAP COMMANDS
 |		| **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
 |		| **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
 |		| **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-		| **task_storage** }
+|		| **task_storage** | **bloom_filter** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 4a1b02ff72c1..493753a4962e 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -710,7 +710,8 @@ _bpftool()
                                 hash_of_maps devmap devmap_hash sockmap cpumap \
                                 xskmap sockhash cgroup_storage reuseport_sockarray \
                                 percpu_cgroup_storage queue stack sk_storage \
-                                struct_ops inode_storage task_storage ringbuf'
+                                struct_ops ringbuf inode_storage task_storage \
+                                bloom_filter'
                             COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) )
                             return 0
                             ;;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 511eccdbdfe6..fa8eb8134344 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -74,6 +74,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
 	[BPF_XDP]			= "xdp",
 	[BPF_SK_REUSEPORT_SELECT]	= "sk_skb_reuseport_select",
 	[BPF_SK_REUSEPORT_SELECT_OR_MIGRATE]	= "sk_skb_reuseport_select_or_migrate",
+	[BPF_PERF_EVENT]		= "perf_event",
 };
 
 void p_err(const char *fmt, ...)
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index cae1f1119296..68cb121e65c4 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -53,6 +53,7 @@ const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_RINGBUF]			= "ringbuf",
 	[BPF_MAP_TYPE_INODE_STORAGE]		= "inode_storage",
 	[BPF_MAP_TYPE_TASK_STORAGE]		= "task_storage",
+	[BPF_MAP_TYPE_BLOOM_FILTER]		= "bloom_filter",
 };
 
 const size_t map_type_name_size = ARRAY_SIZE(map_type_name);
@@ -1477,7 +1478,7 @@ static int do_help(int argc, char **argv)
 		"                 devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
 		"                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
 		"                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
-		"                 task_storage }\n"
+		"                 task_storage | bloom_filter }\n"
 		"       " HELP_SPEC_OPTIONS " |\n"
 		"                    {-f|--bpffs} | {-n|--nomount} }\n"
 		"",
-- 
cgit v1.2.3


From b06be5651f08bc5bc305e2a3c722ddb33b783ee5 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Wed, 10 Nov 2021 11:46:32 +0000
Subject: bpftool: Fix mixed indentation in documentation

Some paragraphs in bpftool's documentation have a mix of tabs and spaces
for indentation. Let's make it consistent.

This patch brings no change to the text content.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211110114632.24537-7-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 10 ++--
 tools/bpf/bpftool/Documentation/bpftool-net.rst    | 66 +++++++++++-----------
 2 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 13a217a2503d..8069d37dd991 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -30,9 +30,9 @@ CGROUP COMMANDS
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
 |		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
-|               **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
-|               **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** |
-|               **sock_release** }
+|		**getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
+|		**sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** |
+|		**sock_release** }
 |	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -98,9 +98,9 @@ DESCRIPTION
 		  **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an
 		  unconnected udp6 socket (since 4.18);
 		  **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
-                  an unconnected udp4 socket (since 5.2);
+		  an unconnected udp4 socket (since 5.2);
 		  **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
-                  an unconnected udp6 socket (since 5.2);
+		  an unconnected udp6 socket (since 5.2);
 		  **sysctl** sysctl access (since 5.2);
 		  **getsockopt** call to getsockopt (since 5.3);
 		  **setsockopt** call to setsockopt (since 5.3);
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 1ae0375e8fea..7ec57535a7c1 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -31,44 +31,44 @@ NET COMMANDS
 DESCRIPTION
 ===========
 	**bpftool net { show | list }** [ **dev** *NAME* ]
-                  List bpf program attachments in the kernel networking subsystem.
-
-                  Currently, only device driver xdp attachments and tc filter
-                  classification/action attachments are implemented, i.e., for
-                  program types **BPF_PROG_TYPE_SCHED_CLS**,
-                  **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
-                  For programs attached to a particular cgroup, e.g.,
-                  **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
-                  **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
-                  users can use **bpftool cgroup** to dump cgroup attachments.
-                  For sk_{filter, skb, msg, reuseport} and lwt/seg6
-                  bpf programs, users should consult other tools, e.g., iproute2.
-
-                  The current output will start with all xdp program attachments, followed by
-                  all tc class/qdisc bpf program attachments. Both xdp programs and
-                  tc programs are ordered based on ifindex number. If multiple bpf
-                  programs attached to the same networking device through **tc filter**,
-                  the order will be first all bpf programs attached to tc classes, then
-                  all bpf programs attached to non clsact qdiscs, and finally all
-                  bpf programs attached to root and clsact qdisc.
+		  List bpf program attachments in the kernel networking subsystem.
+
+		  Currently, only device driver xdp attachments and tc filter
+		  classification/action attachments are implemented, i.e., for
+		  program types **BPF_PROG_TYPE_SCHED_CLS**,
+		  **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**.
+		  For programs attached to a particular cgroup, e.g.,
+		  **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
+		  **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+		  users can use **bpftool cgroup** to dump cgroup attachments.
+		  For sk_{filter, skb, msg, reuseport} and lwt/seg6
+		  bpf programs, users should consult other tools, e.g., iproute2.
+
+		  The current output will start with all xdp program attachments, followed by
+		  all tc class/qdisc bpf program attachments. Both xdp programs and
+		  tc programs are ordered based on ifindex number. If multiple bpf
+		  programs attached to the same networking device through **tc filter**,
+		  the order will be first all bpf programs attached to tc classes, then
+		  all bpf programs attached to non clsact qdiscs, and finally all
+		  bpf programs attached to root and clsact qdisc.
 
 	**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
-                  Attach bpf program *PROG* to network interface *NAME* with
-                  type specified by *ATTACH_TYPE*. Previously attached bpf program
-                  can be replaced by the command used with **overwrite** option.
-                  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+		  Attach bpf program *PROG* to network interface *NAME* with
+		  type specified by *ATTACH_TYPE*. Previously attached bpf program
+		  can be replaced by the command used with **overwrite** option.
+		  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
 
-                  *ATTACH_TYPE* can be of:
-                  **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
-                  **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb;
-                  **xdpdrv** - Native XDP. runs earliest point in driver's receive path;
-                  **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception;
+		  *ATTACH_TYPE* can be of:
+		  **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
+		  **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb;
+		  **xdpdrv** - Native XDP. runs earliest point in driver's receive path;
+		  **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception;
 
 	**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
-                  Detach bpf program attached to network interface *NAME* with
-                  type specified by *ATTACH_TYPE*. To detach bpf program, same
-                  *ATTACH_TYPE* previously used for attach must be specified.
-                  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+		  Detach bpf program attached to network interface *NAME* with
+		  type specified by *ATTACH_TYPE*. To detach bpf program, same
+		  *ATTACH_TYPE* previously used for attach must be specified.
+		  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
 
 	**bpftool net help**
 		  Print short help message.
-- 
cgit v1.2.3


From e5043894b21f7d99d3db31ad06308d6c5726caa6 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Mon, 15 Nov 2021 09:24:36 +0800
Subject: bpftool: Use libbpf_get_error() to check error

Currently, LIBBPF_STRICT_ALL mode is enabled by default for
bpftool which means on error cases, some libbpf APIs would
return NULL pointers. This makes IS_ERR check failed to detect
such cases and result in segfault error. Use libbpf_get_error()
instead like we do in libbpf itself.

Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211115012436.3143318-1-hengqi.chen@gmail.com
---
 tools/bpf/bpftool/btf.c        |  9 +++++----
 tools/bpf/bpftool/gen.c        | 10 ++++++----
 tools/bpf/bpftool/iter.c       |  7 ++++---
 tools/bpf/bpftool/map.c        | 10 +++++-----
 tools/bpf/bpftool/struct_ops.c | 14 +++++++-------
 5 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index c7e3b0b0029e..59833125ac0a 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -421,8 +421,9 @@ static int dump_btf_c(const struct btf *btf,
 	int err = 0, i;
 
 	d = btf_dump__new(btf, btf_dump_printf, NULL, NULL);
-	if (IS_ERR(d))
-		return PTR_ERR(d);
+	err = libbpf_get_error(d);
+	if (err)
+		return err;
 
 	printf("#ifndef __VMLINUX_H__\n");
 	printf("#define __VMLINUX_H__\n");
@@ -549,8 +550,8 @@ static int do_dump(int argc, char **argv)
 		}
 
 		btf = btf__parse_split(*argv, base ?: base_btf);
-		if (IS_ERR(btf)) {
-			err = -PTR_ERR(btf);
+		err = libbpf_get_error(btf);
+		if (err) {
 			btf = NULL;
 			p_err("failed to load BTF from %s: %s",
 			      *argv, strerror(err));
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 89f0e828bbfa..997a2865e04a 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -219,8 +219,9 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name)
 	int i, err = 0;
 
 	d = btf_dump__new(btf, codegen_btf_dump_printf, NULL, NULL);
-	if (IS_ERR(d))
-		return PTR_ERR(d);
+	err = libbpf_get_error(d);
+	if (err)
+		return err;
 
 	bpf_object__for_each_map(map, obj) {
 		/* only generate definitions for memory-mapped internal maps */
@@ -719,10 +720,11 @@ static int do_skeleton(int argc, char **argv)
 		get_obj_name(obj_name, file);
 	opts.object_name = obj_name;
 	obj = bpf_object__open_mem(obj_data, file_sz, &opts);
-	if (IS_ERR(obj)) {
+	err = libbpf_get_error(obj);
+	if (err) {
 		char err_buf[256];
 
-		libbpf_strerror(PTR_ERR(obj), err_buf, sizeof(err_buf));
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
 		p_err("failed to open BPF object file: %s", err_buf);
 		obj = NULL;
 		goto out;
diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
index 6c0de647b8ad..f88fdc820d23 100644
--- a/tools/bpf/bpftool/iter.c
+++ b/tools/bpf/bpftool/iter.c
@@ -46,7 +46,8 @@ static int do_pin(int argc, char **argv)
 	}
 
 	obj = bpf_object__open(objfile);
-	if (IS_ERR(obj)) {
+	err = libbpf_get_error(obj);
+	if (err) {
 		p_err("can't open objfile %s", objfile);
 		goto close_map_fd;
 	}
@@ -64,8 +65,8 @@ static int do_pin(int argc, char **argv)
 	}
 
 	link = bpf_program__attach_iter(prog, &iter_opts);
-	if (IS_ERR(link)) {
-		err = PTR_ERR(link);
+	err = libbpf_get_error(link);
+	if (err) {
 		p_err("attach_iter failed for program %s",
 		      bpf_program__name(prog));
 		goto close_obj;
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 68cb121e65c4..25b258804f11 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -812,7 +812,7 @@ static struct btf *get_map_kv_btf(const struct bpf_map_info *info)
 	if (info->btf_vmlinux_value_type_id) {
 		if (!btf_vmlinux) {
 			btf_vmlinux = libbpf_find_kernel_btf();
-			if (IS_ERR(btf_vmlinux))
+			if (libbpf_get_error(btf_vmlinux))
 				p_err("failed to get kernel btf");
 		}
 		return btf_vmlinux;
@@ -832,13 +832,13 @@ static struct btf *get_map_kv_btf(const struct bpf_map_info *info)
 
 static void free_map_kv_btf(struct btf *btf)
 {
-	if (!IS_ERR(btf) && btf != btf_vmlinux)
+	if (!libbpf_get_error(btf) && btf != btf_vmlinux)
 		btf__free(btf);
 }
 
 static void free_btf_vmlinux(void)
 {
-	if (!IS_ERR(btf_vmlinux))
+	if (!libbpf_get_error(btf_vmlinux))
 		btf__free(btf_vmlinux);
 }
 
@@ -863,8 +863,8 @@ map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr,
 
 	if (wtr) {
 		btf = get_map_kv_btf(info);
-		if (IS_ERR(btf)) {
-			err = PTR_ERR(btf);
+		err = libbpf_get_error(btf);
+		if (err) {
 			goto exit_free;
 		}
 
diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c
index 20f803dce2e4..cbdca37a53f0 100644
--- a/tools/bpf/bpftool/struct_ops.c
+++ b/tools/bpf/bpftool/struct_ops.c
@@ -32,7 +32,7 @@ static const struct btf *get_btf_vmlinux(void)
 		return btf_vmlinux;
 
 	btf_vmlinux = libbpf_find_kernel_btf();
-	if (IS_ERR(btf_vmlinux))
+	if (libbpf_get_error(btf_vmlinux))
 		p_err("struct_ops requires kernel CONFIG_DEBUG_INFO_BTF=y");
 
 	return btf_vmlinux;
@@ -45,7 +45,7 @@ static const char *get_kern_struct_ops_name(const struct bpf_map_info *info)
 	const char *st_ops_name;
 
 	kern_btf = get_btf_vmlinux();
-	if (IS_ERR(kern_btf))
+	if (libbpf_get_error(kern_btf))
 		return "<btf_vmlinux_not_found>";
 
 	t = btf__type_by_id(kern_btf, info->btf_vmlinux_value_type_id);
@@ -63,7 +63,7 @@ static __s32 get_map_info_type_id(void)
 		return map_info_type_id;
 
 	kern_btf = get_btf_vmlinux();
-	if (IS_ERR(kern_btf)) {
+	if (libbpf_get_error(kern_btf)) {
 		map_info_type_id = PTR_ERR(kern_btf);
 		return map_info_type_id;
 	}
@@ -415,7 +415,7 @@ static int do_dump(int argc, char **argv)
 	}
 
 	kern_btf = get_btf_vmlinux();
-	if (IS_ERR(kern_btf))
+	if (libbpf_get_error(kern_btf))
 		return -1;
 
 	if (!json_output) {
@@ -495,7 +495,7 @@ static int do_register(int argc, char **argv)
 	file = GET_ARG();
 
 	obj = bpf_object__open(file);
-	if (IS_ERR_OR_NULL(obj))
+	if (libbpf_get_error(obj))
 		return -1;
 
 	set_max_rlimit();
@@ -516,7 +516,7 @@ static int do_register(int argc, char **argv)
 			continue;
 
 		link = bpf_map__attach_struct_ops(map);
-		if (IS_ERR(link)) {
+		if (libbpf_get_error(link)) {
 			p_err("can't register struct_ops %s: %s",
 			      bpf_map__name(map),
 			      strerror(-PTR_ERR(link)));
@@ -596,7 +596,7 @@ int do_struct_ops(int argc, char **argv)
 
 	err = cmd_select(cmds, argc, argv, do_help);
 
-	if (!IS_ERR(btf_vmlinux))
+	if (!libbpf_get_error(btf_vmlinux))
 		btf__free(btf_vmlinux);
 
 	return err;
-- 
cgit v1.2.3


From a2acf0c0e2da29950d0361a3b5ea05e8d0351dfe Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Nov 2021 18:23:30 +0100
Subject: selftests: nft_nat: switch port shadow test cases to socat

There are now at least three distinct flavours of netcat/nc tool:
'original' version, one version ported from openbsd and nmap-ncat.

The script only works with original because it sets SOREUSEPORT option.

Other nc versions return 'port already in use' error and port shadow test fails:

PASS: inet IPv6 redirection for ns2-hMHcaRvx
nc: bind failed: Address already in use
ERROR: portshadow test default: got reply from "ROUTER", not CLIENT as intended

Switch to socat instead.

Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_nat.sh | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
index c62e4e26252c..d88867d2fed7 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -760,20 +760,20 @@ test_port_shadow()
 	local logmsg=""
 
 	# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
-	echo "fake-entry" | ip netns exec "$ns2" nc -w 1 -p 1405 -u "$daddrc" 41404 > /dev/null
+	echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405
 
-	echo ROUTER | ip netns exec "$ns0" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
-	nc_r=$!
+	echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 &
+	sc_r=$!
 
-	echo CLIENT | ip netns exec "$ns2" nc -w 5 -u -l -p 1405 >/dev/null 2>&1 &
-	nc_c=$!
+	echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport &
+	sc_c=$!
 
 	sleep 0.3
 
 	# ns1 tries to connect to ns0:1405.  With default settings this should connect
 	# to client, it matches the conntrack entry created above.
 
-	result=$(echo "" | ip netns exec "$ns1" nc -w 1 -p 41404 -u "$daddrs" 1405)
+	result=$(echo "data" | ip netns exec "$ns1" timeout 1 socat - UDP:"$daddrs":1405,sourceport=41404)
 
 	if [ "$result" = "$expect" ] ;then
 		echo "PASS: portshadow test $test: got reply from ${expect}${logmsg}"
@@ -782,7 +782,7 @@ test_port_shadow()
 		ret=1
 	fi
 
-	kill $nc_r $nc_c 2>/dev/null
+	kill $sc_r $sc_c 2>/dev/null
 
 	# flush udp entries for next test round, if any
 	ip netns exec "$ns0" conntrack -F >/dev/null 2>&1
@@ -852,6 +852,18 @@ test_port_shadowing()
 {
 	local family="ip"
 
+	conntrack -h >/dev/null 2>&1
+	if [ $? -ne 0 ];then
+		echo "SKIP: Could not run nat port shadowing test without conntrack tool"
+		return
+	fi
+
+	socat -h > /dev/null 2>&1
+	if [ $? -ne 0 ];then
+		echo "SKIP: Could not run nat port shadowing test without socat tool"
+		return
+	fi
+
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
 
-- 
cgit v1.2.3


From 5e2411ae8071612396cd25017e29f9ce0662a5ff Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 2 Nov 2021 13:29:01 -0700
Subject: cxl/memdev: Change cxl_mem to a more descriptive name

The 'struct cxl_mem' object actually represents the state of a CXL
device within the driver. Comments indicating that 'struct cxl_mem' is a
device itself are incorrect. It is data layered on top of a CXL Memory
Expander class device. Rename it 'struct cxl_dev_state'. The 'struct'
cxl_memdev' structure represents a Linux CXL memory device object, and
it uses services and information provided by 'struct cxl_dev_state'.

Update the structure name, function names, and the kdocs to reflect the
real uses of this structure.

Some helper functions that were previously prefixed "cxl_mem_" are
renamed to just "cxl_".

Acked-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20211102202901.3675568-3-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/mbox.c      | 182 +++++++++++++++++++++----------------------
 drivers/cxl/core/memdev.c    |  49 ++++++------
 drivers/cxl/cxlmem.h         |  35 +++++----
 drivers/cxl/pci.c            | 120 ++++++++++++++--------------
 drivers/cxl/pmem.c           |  39 +++++-----
 tools/testing/cxl/test/mem.c |  50 ++++++------
 6 files changed, 239 insertions(+), 236 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 79bb64218339..0806ba8d38cb 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -128,8 +128,8 @@ static struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
 }
 
 /**
- * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
- * @cxlm: The CXL memory device to communicate with.
+ * cxl_mbox_send_cmd() - Send a mailbox command to a device.
+ * @cxlds: The device data for the operation
  * @opcode: Opcode for the mailbox command.
  * @in: The input payload for the mailbox command.
  * @in_size: The length of the input payload
@@ -149,8 +149,8 @@ static struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
  * error. While this distinction can be useful for commands from userspace, the
  * kernel will only be able to use results when both are successful.
  */
-int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode, void *in,
-			  size_t in_size, void *out, size_t out_size)
+int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in,
+		      size_t in_size, void *out, size_t out_size)
 {
 	const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
 	struct cxl_mbox_cmd mbox_cmd = {
@@ -162,10 +162,10 @@ int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode, void *in,
 	};
 	int rc;
 
-	if (out_size > cxlm->payload_size)
+	if (out_size > cxlds->payload_size)
 		return -E2BIG;
 
-	rc = cxlm->mbox_send(cxlm, &mbox_cmd);
+	rc = cxlds->mbox_send(cxlds, &mbox_cmd);
 	if (rc)
 		return rc;
 
@@ -182,7 +182,7 @@ int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode, void *in,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cxl_mem_mbox_send_cmd);
+EXPORT_SYMBOL_GPL(cxl_mbox_send_cmd);
 
 static bool cxl_mem_raw_command_allowed(u16 opcode)
 {
@@ -209,7 +209,7 @@ static bool cxl_mem_raw_command_allowed(u16 opcode)
 
 /**
  * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
- * @cxlm: &struct cxl_mem device whose mailbox will be used.
+ * @cxlds: The device data for the operation
  * @send_cmd: &struct cxl_send_command copied in from userspace.
  * @out_cmd: Sanitized and populated &struct cxl_mem_command.
  *
@@ -226,7 +226,7 @@ static bool cxl_mem_raw_command_allowed(u16 opcode)
  *
  * See handle_mailbox_cmd_from_user()
  */
-static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
+static int cxl_validate_cmd_from_user(struct cxl_dev_state *cxlds,
 				      const struct cxl_send_command *send_cmd,
 				      struct cxl_mem_command *out_cmd)
 {
@@ -241,7 +241,7 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	 * supports, but output can be arbitrarily large (simply write out as
 	 * much data as the hardware provides).
 	 */
-	if (send_cmd->in.size > cxlm->payload_size)
+	if (send_cmd->in.size > cxlds->payload_size)
 		return -EINVAL;
 
 	/*
@@ -267,7 +267,7 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 		 * gets passed along without further checking, so it must be
 		 * validated here.
 		 */
-		if (send_cmd->out.size > cxlm->payload_size)
+		if (send_cmd->out.size > cxlds->payload_size)
 			return -EINVAL;
 
 		if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
@@ -292,11 +292,11 @@ static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
 	info = &c->info;
 
 	/* Check that the command is enabled for hardware */
-	if (!test_bit(info->id, cxlm->enabled_cmds))
+	if (!test_bit(info->id, cxlds->enabled_cmds))
 		return -ENOTTY;
 
 	/* Check that the command is not claimed for exclusive kernel use */
-	if (test_bit(info->id, cxlm->exclusive_cmds))
+	if (test_bit(info->id, cxlds->exclusive_cmds))
 		return -EBUSY;
 
 	/* Check the input buffer is the expected size */
@@ -354,7 +354,7 @@ int cxl_query_cmd(struct cxl_memdev *cxlmd,
 
 /**
  * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
- * @cxlm: The CXL memory device to communicate with.
+ * @cxlds: The device data for the operation
  * @cmd: The validated command.
  * @in_payload: Pointer to userspace's input payload.
  * @out_payload: Pointer to userspace's output payload.
@@ -377,12 +377,12 @@ int cxl_query_cmd(struct cxl_memdev *cxlmd,
  *
  * See cxl_send_cmd().
  */
-static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
+static int handle_mailbox_cmd_from_user(struct cxl_dev_state *cxlds,
 					const struct cxl_mem_command *cmd,
 					u64 in_payload, u64 out_payload,
 					s32 *size_out, u32 *retval)
 {
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 	struct cxl_mbox_cmd mbox_cmd = {
 		.opcode = cmd->opcode,
 		.size_in = cmd->info.size_in,
@@ -415,7 +415,7 @@ static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
 	dev_WARN_ONCE(dev, cmd->info.id == CXL_MEM_COMMAND_ID_RAW,
 		      "raw command path used\n");
 
-	rc = cxlm->mbox_send(cxlm, &mbox_cmd);
+	rc = cxlds->mbox_send(cxlds, &mbox_cmd);
 	if (rc)
 		goto out;
 
@@ -445,7 +445,7 @@ out:
 
 int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s)
 {
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct device *dev = &cxlmd->dev;
 	struct cxl_send_command send;
 	struct cxl_mem_command c;
@@ -456,15 +456,15 @@ int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s)
 	if (copy_from_user(&send, s, sizeof(send)))
 		return -EFAULT;
 
-	rc = cxl_validate_cmd_from_user(cxlmd->cxlm, &send, &c);
+	rc = cxl_validate_cmd_from_user(cxlmd->cxlds, &send, &c);
 	if (rc)
 		return rc;
 
 	/* Prepare to handle a full payload for variable sized output */
 	if (c.info.size_out < 0)
-		c.info.size_out = cxlm->payload_size;
+		c.info.size_out = cxlds->payload_size;
 
-	rc = handle_mailbox_cmd_from_user(cxlm, &c, send.in.payload,
+	rc = handle_mailbox_cmd_from_user(cxlds, &c, send.in.payload,
 					  send.out.payload, &send.out.size,
 					  &send.retval);
 	if (rc)
@@ -476,13 +476,13 @@ int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s)
 	return 0;
 }
 
-static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
+static int cxl_xfer_log(struct cxl_dev_state *cxlds, uuid_t *uuid, u32 size, u8 *out)
 {
 	u32 remaining = size;
 	u32 offset = 0;
 
 	while (remaining) {
-		u32 xfer_size = min_t(u32, remaining, cxlm->payload_size);
+		u32 xfer_size = min_t(u32, remaining, cxlds->payload_size);
 		struct cxl_mbox_get_log log = {
 			.uuid = *uuid,
 			.offset = cpu_to_le32(offset),
@@ -490,8 +490,8 @@ static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
 		};
 		int rc;
 
-		rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LOG, &log,
-					   sizeof(log), out, xfer_size);
+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LOG, &log, sizeof(log),
+				       out, xfer_size);
 		if (rc < 0)
 			return rc;
 
@@ -505,14 +505,14 @@ static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
 
 /**
  * cxl_walk_cel() - Walk through the Command Effects Log.
- * @cxlm: Device.
+ * @cxlds: The device data for the operation
  * @size: Length of the Command Effects Log.
  * @cel: CEL
  *
  * Iterate over each entry in the CEL and determine if the driver supports the
  * command. If so, the command is enabled for the device and can be used later.
  */
-static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
+static void cxl_walk_cel(struct cxl_dev_state *cxlds, size_t size, u8 *cel)
 {
 	struct cxl_cel_entry *cel_entry;
 	const int cel_entries = size / sizeof(*cel_entry);
@@ -525,26 +525,26 @@ static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
 		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
 
 		if (!cmd) {
-			dev_dbg(cxlm->dev,
+			dev_dbg(cxlds->dev,
 				"Opcode 0x%04x unsupported by driver", opcode);
 			continue;
 		}
 
-		set_bit(cmd->info.id, cxlm->enabled_cmds);
+		set_bit(cmd->info.id, cxlds->enabled_cmds);
 	}
 }
 
-static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_mem *cxlm)
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_dev_state *cxlds)
 {
 	struct cxl_mbox_get_supported_logs *ret;
 	int rc;
 
-	ret = kvmalloc(cxlm->payload_size, GFP_KERNEL);
+	ret = kvmalloc(cxlds->payload_size, GFP_KERNEL);
 	if (!ret)
 		return ERR_PTR(-ENOMEM);
 
-	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL,
-				   0, ret, cxlm->payload_size);
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL, 0, ret,
+			       cxlds->payload_size);
 	if (rc < 0) {
 		kvfree(ret);
 		return ERR_PTR(rc);
@@ -565,23 +565,23 @@ static const uuid_t log_uuid[] = {
 };
 
 /**
- * cxl_mem_enumerate_cmds() - Enumerate commands for a device.
- * @cxlm: The device.
+ * cxl_enumerate_cmds() - Enumerate commands for a device.
+ * @cxlds: The device data for the operation
  *
  * Returns 0 if enumerate completed successfully.
  *
  * CXL devices have optional support for certain commands. This function will
  * determine the set of supported commands for the hardware and update the
- * enabled_cmds bitmap in the @cxlm.
+ * enabled_cmds bitmap in the @cxlds.
  */
-int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
+int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
 {
 	struct cxl_mbox_get_supported_logs *gsl;
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 	struct cxl_mem_command *cmd;
 	int i, rc;
 
-	gsl = cxl_get_gsl(cxlm);
+	gsl = cxl_get_gsl(cxlds);
 	if (IS_ERR(gsl))
 		return PTR_ERR(gsl);
 
@@ -602,19 +602,19 @@ int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
 			goto out;
 		}
 
-		rc = cxl_xfer_log(cxlm, &uuid, size, log);
+		rc = cxl_xfer_log(cxlds, &uuid, size, log);
 		if (rc) {
 			kvfree(log);
 			goto out;
 		}
 
-		cxl_walk_cel(cxlm, size, log);
+		cxl_walk_cel(cxlds, size, log);
 		kvfree(log);
 
 		/* In case CEL was bogus, enable some default commands. */
 		cxl_for_each_cmd(cmd)
 			if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
-				set_bit(cmd->info.id, cxlm->enabled_cmds);
+				set_bit(cmd->info.id, cxlds->enabled_cmds);
 
 		/* Found the required CEL */
 		rc = 0;
@@ -624,11 +624,11 @@ out:
 	kvfree(gsl);
 	return rc;
 }
-EXPORT_SYMBOL_GPL(cxl_mem_enumerate_cmds);
+EXPORT_SYMBOL_GPL(cxl_enumerate_cmds);
 
 /**
  * cxl_mem_get_partition_info - Get partition info
- * @cxlm: cxl_mem instance to update partition info
+ * @cxlds: The device data for the operation
  *
  * Retrieve the current partition info for the device specified.  The active
  * values are the current capacity in bytes.  If not 0, the 'next' values are
@@ -638,7 +638,7 @@ EXPORT_SYMBOL_GPL(cxl_mem_enumerate_cmds);
  *
  * See CXL @8.2.9.5.2.1 Get Partition Info
  */
-static int cxl_mem_get_partition_info(struct cxl_mem *cxlm)
+static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds)
 {
 	struct cxl_mbox_get_partition_info {
 		__le64 active_volatile_cap;
@@ -648,124 +648,124 @@ static int cxl_mem_get_partition_info(struct cxl_mem *cxlm)
 	} __packed pi;
 	int rc;
 
-	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_PARTITION_INFO,
-				   NULL, 0, &pi, sizeof(pi));
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_PARTITION_INFO, NULL, 0,
+			       &pi, sizeof(pi));
 
 	if (rc)
 		return rc;
 
-	cxlm->active_volatile_bytes =
+	cxlds->active_volatile_bytes =
 		le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->active_persistent_bytes =
+	cxlds->active_persistent_bytes =
 		le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->next_volatile_bytes =
+	cxlds->next_volatile_bytes =
 		le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->next_persistent_bytes =
+	cxlds->next_persistent_bytes =
 		le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
 
 	return 0;
 }
 
 /**
- * cxl_mem_identify() - Send the IDENTIFY command to the device.
- * @cxlm: The device to identify.
+ * cxl_dev_state_identify() - Send the IDENTIFY command to the device.
+ * @cxlds: The device data for the operation
  *
  * Return: 0 if identify was executed successfully.
  *
  * This will dispatch the identify command to the device and on success populate
  * structures to be exported to sysfs.
  */
-int cxl_mem_identify(struct cxl_mem *cxlm)
+int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 {
 	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
 	struct cxl_mbox_identify id;
 	int rc;
 
-	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
-				   sizeof(id));
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
+			       sizeof(id));
 	if (rc < 0)
 		return rc;
 
-	cxlm->total_bytes =
+	cxlds->total_bytes =
 		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->volatile_only_bytes =
+	cxlds->volatile_only_bytes =
 		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->persistent_only_bytes =
+	cxlds->persistent_only_bytes =
 		le64_to_cpu(id.persistent_capacity) * CXL_CAPACITY_MULTIPLIER;
-	cxlm->partition_align_bytes =
+	cxlds->partition_align_bytes =
 		le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER;
 
-	dev_dbg(cxlm->dev,
+	dev_dbg(cxlds->dev,
 		"Identify Memory Device\n"
 		"     total_bytes = %#llx\n"
 		"     volatile_only_bytes = %#llx\n"
 		"     persistent_only_bytes = %#llx\n"
 		"     partition_align_bytes = %#llx\n",
-		cxlm->total_bytes, cxlm->volatile_only_bytes,
-		cxlm->persistent_only_bytes, cxlm->partition_align_bytes);
+		cxlds->total_bytes, cxlds->volatile_only_bytes,
+		cxlds->persistent_only_bytes, cxlds->partition_align_bytes);
 
-	cxlm->lsa_size = le32_to_cpu(id.lsa_size);
-	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
+	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
+	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cxl_mem_identify);
+EXPORT_SYMBOL_GPL(cxl_dev_state_identify);
 
-int cxl_mem_create_range_info(struct cxl_mem *cxlm)
+int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
 {
 	int rc;
 
-	if (cxlm->partition_align_bytes == 0) {
-		cxlm->ram_range.start = 0;
-		cxlm->ram_range.end = cxlm->volatile_only_bytes - 1;
-		cxlm->pmem_range.start = cxlm->volatile_only_bytes;
-		cxlm->pmem_range.end = cxlm->volatile_only_bytes +
-				       cxlm->persistent_only_bytes - 1;
+	if (cxlds->partition_align_bytes == 0) {
+		cxlds->ram_range.start = 0;
+		cxlds->ram_range.end = cxlds->volatile_only_bytes - 1;
+		cxlds->pmem_range.start = cxlds->volatile_only_bytes;
+		cxlds->pmem_range.end = cxlds->volatile_only_bytes +
+				       cxlds->persistent_only_bytes - 1;
 		return 0;
 	}
 
-	rc = cxl_mem_get_partition_info(cxlm);
+	rc = cxl_mem_get_partition_info(cxlds);
 	if (rc) {
-		dev_err(cxlm->dev, "Failed to query partition information\n");
+		dev_err(cxlds->dev, "Failed to query partition information\n");
 		return rc;
 	}
 
-	dev_dbg(cxlm->dev,
+	dev_dbg(cxlds->dev,
 		"Get Partition Info\n"
 		"     active_volatile_bytes = %#llx\n"
 		"     active_persistent_bytes = %#llx\n"
 		"     next_volatile_bytes = %#llx\n"
 		"     next_persistent_bytes = %#llx\n",
-		cxlm->active_volatile_bytes, cxlm->active_persistent_bytes,
-		cxlm->next_volatile_bytes, cxlm->next_persistent_bytes);
+		cxlds->active_volatile_bytes, cxlds->active_persistent_bytes,
+		cxlds->next_volatile_bytes, cxlds->next_persistent_bytes);
 
-	cxlm->ram_range.start = 0;
-	cxlm->ram_range.end = cxlm->active_volatile_bytes - 1;
+	cxlds->ram_range.start = 0;
+	cxlds->ram_range.end = cxlds->active_volatile_bytes - 1;
 
-	cxlm->pmem_range.start = cxlm->active_volatile_bytes;
-	cxlm->pmem_range.end =
-		cxlm->active_volatile_bytes + cxlm->active_persistent_bytes - 1;
+	cxlds->pmem_range.start = cxlds->active_volatile_bytes;
+	cxlds->pmem_range.end =
+		cxlds->active_volatile_bytes + cxlds->active_persistent_bytes - 1;
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cxl_mem_create_range_info);
 
-struct cxl_mem *cxl_mem_create(struct device *dev)
+struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
 {
-	struct cxl_mem *cxlm;
+	struct cxl_dev_state *cxlds;
 
-	cxlm = devm_kzalloc(dev, sizeof(*cxlm), GFP_KERNEL);
-	if (!cxlm) {
+	cxlds = devm_kzalloc(dev, sizeof(*cxlds), GFP_KERNEL);
+	if (!cxlds) {
 		dev_err(dev, "No memory available\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
-	mutex_init(&cxlm->mbox_mutex);
-	cxlm->dev = dev;
+	mutex_init(&cxlds->mbox_mutex);
+	cxlds->dev = dev;
 
-	return cxlm;
+	return cxlds;
 }
-EXPORT_SYMBOL_GPL(cxl_mem_create);
+EXPORT_SYMBOL_GPL(cxl_dev_state_create);
 
 static struct dentry *cxl_debugfs;
 
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index bf1b04d00ff4..6d6c73802631 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -37,9 +37,9 @@ static ssize_t firmware_version_show(struct device *dev,
 				     struct device_attribute *attr, char *buf)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 
-	return sysfs_emit(buf, "%.16s\n", cxlm->firmware_version);
+	return sysfs_emit(buf, "%.16s\n", cxlds->firmware_version);
 }
 static DEVICE_ATTR_RO(firmware_version);
 
@@ -47,9 +47,9 @@ static ssize_t payload_max_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 
-	return sysfs_emit(buf, "%zu\n", cxlm->payload_size);
+	return sysfs_emit(buf, "%zu\n", cxlds->payload_size);
 }
 static DEVICE_ATTR_RO(payload_max);
 
@@ -57,9 +57,9 @@ static ssize_t label_storage_size_show(struct device *dev,
 				       struct device_attribute *attr, char *buf)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 
-	return sysfs_emit(buf, "%zu\n", cxlm->lsa_size);
+	return sysfs_emit(buf, "%zu\n", cxlds->lsa_size);
 }
 static DEVICE_ATTR_RO(label_storage_size);
 
@@ -67,8 +67,8 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_mem *cxlm = cxlmd->cxlm;
-	unsigned long long len = range_len(&cxlm->ram_range);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	unsigned long long len = range_len(&cxlds->ram_range);
 
 	return sysfs_emit(buf, "%#llx\n", len);
 }
@@ -80,8 +80,8 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_mem *cxlm = cxlmd->cxlm;
-	unsigned long long len = range_len(&cxlm->pmem_range);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	unsigned long long len = range_len(&cxlds->pmem_range);
 
 	return sysfs_emit(buf, "%#llx\n", len);
 }
@@ -136,17 +136,17 @@ static const struct device_type cxl_memdev_type = {
 
 /**
  * set_exclusive_cxl_commands() - atomically disable user cxl commands
- * @cxlm: cxl_mem instance to modify
+ * @cxlds: The device state to operate on
  * @cmds: bitmap of commands to mark exclusive
  *
  * Grab the cxl_memdev_rwsem in write mode to flush in-flight
  * invocations of the ioctl path and then disable future execution of
  * commands with the command ids set in @cmds.
  */
-void set_exclusive_cxl_commands(struct cxl_mem *cxlm, unsigned long *cmds)
+void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds)
 {
 	down_write(&cxl_memdev_rwsem);
-	bitmap_or(cxlm->exclusive_cmds, cxlm->exclusive_cmds, cmds,
+	bitmap_or(cxlds->exclusive_cmds, cxlds->exclusive_cmds, cmds,
 		  CXL_MEM_COMMAND_ID_MAX);
 	up_write(&cxl_memdev_rwsem);
 }
@@ -154,13 +154,13 @@ EXPORT_SYMBOL_GPL(set_exclusive_cxl_commands);
 
 /**
  * clear_exclusive_cxl_commands() - atomically enable user cxl commands
- * @cxlm: cxl_mem instance to modify
+ * @cxlds: The device state to modify
  * @cmds: bitmap of commands to mark available for userspace
  */
-void clear_exclusive_cxl_commands(struct cxl_mem *cxlm, unsigned long *cmds)
+void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds)
 {
 	down_write(&cxl_memdev_rwsem);
-	bitmap_andnot(cxlm->exclusive_cmds, cxlm->exclusive_cmds, cmds,
+	bitmap_andnot(cxlds->exclusive_cmds, cxlds->exclusive_cmds, cmds,
 		      CXL_MEM_COMMAND_ID_MAX);
 	up_write(&cxl_memdev_rwsem);
 }
@@ -171,7 +171,7 @@ static void cxl_memdev_shutdown(struct device *dev)
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 
 	down_write(&cxl_memdev_rwsem);
-	cxlmd->cxlm = NULL;
+	cxlmd->cxlds = NULL;
 	up_write(&cxl_memdev_rwsem);
 }
 
@@ -185,7 +185,7 @@ static void cxl_memdev_unregister(void *_cxlmd)
 	put_device(dev);
 }
 
-static struct cxl_memdev *cxl_memdev_alloc(struct cxl_mem *cxlm,
+static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds,
 					   const struct file_operations *fops)
 {
 	struct cxl_memdev *cxlmd;
@@ -204,7 +204,7 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_mem *cxlm,
 
 	dev = &cxlmd->dev;
 	device_initialize(dev);
-	dev->parent = cxlm->dev;
+	dev->parent = cxlds->dev;
 	dev->bus = &cxl_bus_type;
 	dev->devt = MKDEV(cxl_mem_major, cxlmd->id);
 	dev->type = &cxl_memdev_type;
@@ -239,7 +239,7 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 	int rc = -ENXIO;
 
 	down_read(&cxl_memdev_rwsem);
-	if (cxlmd->cxlm)
+	if (cxlmd->cxlds)
 		rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
 	up_read(&cxl_memdev_rwsem);
 
@@ -276,15 +276,14 @@ static const struct file_operations cxl_memdev_fops = {
 	.llseek = noop_llseek,
 };
 
-struct cxl_memdev *
-devm_cxl_add_memdev(struct cxl_mem *cxlm)
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
 {
 	struct cxl_memdev *cxlmd;
 	struct device *dev;
 	struct cdev *cdev;
 	int rc;
 
-	cxlmd = cxl_memdev_alloc(cxlm, &cxl_memdev_fops);
+	cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops);
 	if (IS_ERR(cxlmd))
 		return cxlmd;
 
@@ -297,14 +296,14 @@ devm_cxl_add_memdev(struct cxl_mem *cxlm)
 	 * Activate ioctl operations, no cxl_memdev_rwsem manipulation
 	 * needed as this is ordered with cdev_add() publishing the device.
 	 */
-	cxlmd->cxlm = cxlm;
+	cxlmd->cxlds = cxlds;
 
 	cdev = &cxlmd->cdev;
 	rc = cdev_device_add(cdev, dev);
 	if (rc)
 		goto err;
 
-	rc = devm_add_action_or_reset(cxlm->dev, cxl_memdev_unregister, cxlmd);
+	rc = devm_add_action_or_reset(cxlds->dev, cxl_memdev_unregister, cxlmd);
 	if (rc)
 		return ERR_PTR(rc);
 	return cxlmd;
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index c4f450ad434d..f978a046591b 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -33,13 +33,13 @@
  * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
  * @dev: driver core device object
  * @cdev: char dev core object for ioctl operations
- * @cxlm: pointer to the parent device driver data
+ * @cxlds: The device state backing this device
  * @id: id number of this memdev instance.
  */
 struct cxl_memdev {
 	struct device dev;
 	struct cdev cdev;
-	struct cxl_mem *cxlm;
+	struct cxl_dev_state *cxlds;
 	int id;
 };
 
@@ -48,7 +48,7 @@ static inline struct cxl_memdev *to_cxl_memdev(struct device *dev)
 	return container_of(dev, struct cxl_memdev, dev);
 }
 
-struct cxl_memdev *devm_cxl_add_memdev(struct cxl_mem *cxlm);
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
 
 /**
  * struct cxl_mbox_cmd - A command to be submitted to hardware.
@@ -90,8 +90,13 @@ struct cxl_mbox_cmd {
 #define CXL_CAPACITY_MULTIPLIER SZ_256M
 
 /**
- * struct cxl_mem - A CXL memory device
- * @dev: The device associated with this CXL device.
+ * struct cxl_dev_state - The driver device state
+ *
+ * cxl_dev_state represents the CXL driver/device state.  It provides an
+ * interface to mailbox commands as well as some cached data about the device.
+ * Currently only memory devices are represented.
+ *
+ * @dev: The device associated with this CXL state
  * @cxlmd: Logical memory device chardev / interface
  * @regs: Parsed register blocks
  * @payload_size: Size of space for payload
@@ -117,7 +122,7 @@ struct cxl_mbox_cmd {
  * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
  * details on capacity parameters.
  */
-struct cxl_mem {
+struct cxl_dev_state {
 	struct device *dev;
 	struct cxl_memdev *cxlmd;
 
@@ -142,7 +147,7 @@ struct cxl_mem {
 	u64 next_volatile_bytes;
 	u64 next_persistent_bytes;
 
-	int (*mbox_send)(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd);
+	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
 };
 
 enum cxl_opcode {
@@ -253,12 +258,12 @@ struct cxl_mem_command {
 #define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
 };
 
-int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode, void *in,
-			  size_t in_size, void *out, size_t out_size);
-int cxl_mem_identify(struct cxl_mem *cxlm);
-int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm);
-int cxl_mem_create_range_info(struct cxl_mem *cxlm);
-struct cxl_mem *cxl_mem_create(struct device *dev);
-void set_exclusive_cxl_commands(struct cxl_mem *cxlm, unsigned long *cmds);
-void clear_exclusive_cxl_commands(struct cxl_mem *cxlm, unsigned long *cmds);
+int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in,
+		      size_t in_size, void *out, size_t out_size);
+int cxl_dev_state_identify(struct cxl_dev_state *cxlds);
+int cxl_enumerate_cmds(struct cxl_dev_state *cxlds);
+int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
+struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
+void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
+void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
 #endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index c734e21fb4e0..8dc91fd3396a 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -28,39 +28,39 @@
  *  - Registers a CXL mailbox with cxl_core.
  */
 
-#define cxl_doorbell_busy(cxlm)                                                \
-	(readl((cxlm)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &                  \
+#define cxl_doorbell_busy(cxlds)                                                \
+	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &                  \
 	 CXLDEV_MBOX_CTRL_DOORBELL)
 
 /* CXL 2.0 - 8.2.8.4 */
 #define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
 
-static int cxl_pci_mbox_wait_for_doorbell(struct cxl_mem *cxlm)
+static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
 {
 	const unsigned long start = jiffies;
 	unsigned long end = start;
 
-	while (cxl_doorbell_busy(cxlm)) {
+	while (cxl_doorbell_busy(cxlds)) {
 		end = jiffies;
 
 		if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
 			/* Check again in case preempted before timeout test */
-			if (!cxl_doorbell_busy(cxlm))
+			if (!cxl_doorbell_busy(cxlds))
 				break;
 			return -ETIMEDOUT;
 		}
 		cpu_relax();
 	}
 
-	dev_dbg(cxlm->dev, "Doorbell wait took %dms",
+	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
 		jiffies_to_msecs(end) - jiffies_to_msecs(start));
 	return 0;
 }
 
-static void cxl_pci_mbox_timeout(struct cxl_mem *cxlm,
+static void cxl_pci_mbox_timeout(struct cxl_dev_state *cxlds,
 				 struct cxl_mbox_cmd *mbox_cmd)
 {
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 
 	dev_dbg(dev, "Mailbox command (opcode: %#x size: %zub) timed out\n",
 		mbox_cmd->opcode, mbox_cmd->size_in);
@@ -68,7 +68,7 @@ static void cxl_pci_mbox_timeout(struct cxl_mem *cxlm,
 
 /**
  * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
- * @cxlm: The CXL memory device to communicate with.
+ * @cxlds: The device state to communicate with.
  * @mbox_cmd: Command to send to the memory device.
  *
  * Context: Any context. Expects mbox_mutex to be held.
@@ -88,16 +88,16 @@ static void cxl_pci_mbox_timeout(struct cxl_mem *cxlm,
  * not need to coordinate with each other. The driver only uses the primary
  * mailbox.
  */
-static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
+static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
 				   struct cxl_mbox_cmd *mbox_cmd)
 {
-	void __iomem *payload = cxlm->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
-	struct device *dev = cxlm->dev;
+	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
+	struct device *dev = cxlds->dev;
 	u64 cmd_reg, status_reg;
 	size_t out_len;
 	int rc;
 
-	lockdep_assert_held(&cxlm->mbox_mutex);
+	lockdep_assert_held(&cxlds->mbox_mutex);
 
 	/*
 	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
@@ -117,7 +117,7 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
 	 */
 
 	/* #1 */
-	if (cxl_doorbell_busy(cxlm)) {
+	if (cxl_doorbell_busy(cxlds)) {
 		dev_err_ratelimited(dev, "Mailbox re-busy after acquiring\n");
 		return -EBUSY;
 	}
@@ -134,22 +134,22 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
 	}
 
 	/* #2, #3 */
-	writeq(cmd_reg, cxlm->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
 
 	/* #4 */
 	dev_dbg(dev, "Sending command\n");
 	writel(CXLDEV_MBOX_CTRL_DOORBELL,
-	       cxlm->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
 
 	/* #5 */
-	rc = cxl_pci_mbox_wait_for_doorbell(cxlm);
+	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
 	if (rc == -ETIMEDOUT) {
-		cxl_pci_mbox_timeout(cxlm, mbox_cmd);
+		cxl_pci_mbox_timeout(cxlds, mbox_cmd);
 		return rc;
 	}
 
 	/* #6 */
-	status_reg = readq(cxlm->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
+	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
 	mbox_cmd->return_code =
 		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
 
@@ -159,7 +159,7 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
 	}
 
 	/* #7 */
-	cmd_reg = readq(cxlm->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
 	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
 
 	/* #8 */
@@ -171,7 +171,7 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
 		 * have requested less data than the hardware supplied even
 		 * within spec.
 		 */
-		size_t n = min3(mbox_cmd->size_out, cxlm->payload_size, out_len);
+		size_t n = min3(mbox_cmd->size_out, cxlds->payload_size, out_len);
 
 		memcpy_fromio(mbox_cmd->payload_out, payload, n);
 		mbox_cmd->size_out = n;
@@ -184,18 +184,18 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_mem *cxlm,
 
 /**
  * cxl_pci_mbox_get() - Acquire exclusive access to the mailbox.
- * @cxlm: The memory device to gain access to.
+ * @cxlds: The device state to gain access to.
  *
  * Context: Any context. Takes the mbox_mutex.
  * Return: 0 if exclusive access was acquired.
  */
-static int cxl_pci_mbox_get(struct cxl_mem *cxlm)
+static int cxl_pci_mbox_get(struct cxl_dev_state *cxlds)
 {
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 	u64 md_status;
 	int rc;
 
-	mutex_lock_io(&cxlm->mbox_mutex);
+	mutex_lock_io(&cxlds->mbox_mutex);
 
 	/*
 	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
@@ -214,13 +214,13 @@ static int cxl_pci_mbox_get(struct cxl_mem *cxlm)
 	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
 	 *    to be ready is sufficient.
 	 */
-	rc = cxl_pci_mbox_wait_for_doorbell(cxlm);
+	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
 	if (rc) {
 		dev_warn(dev, "Mailbox interface not ready\n");
 		goto out;
 	}
 
-	md_status = readq(cxlm->regs.memdev + CXLMDEV_STATUS_OFFSET);
+	md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
 	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
 		dev_err(dev, "mbox: reported doorbell ready, but not mbox ready\n");
 		rc = -EBUSY;
@@ -249,41 +249,41 @@ static int cxl_pci_mbox_get(struct cxl_mem *cxlm)
 	return 0;
 
 out:
-	mutex_unlock(&cxlm->mbox_mutex);
+	mutex_unlock(&cxlds->mbox_mutex);
 	return rc;
 }
 
 /**
  * cxl_pci_mbox_put() - Release exclusive access to the mailbox.
- * @cxlm: The CXL memory device to communicate with.
+ * @cxlds: The device state to communicate with.
  *
  * Context: Any context. Expects mbox_mutex to be held.
  */
-static void cxl_pci_mbox_put(struct cxl_mem *cxlm)
+static void cxl_pci_mbox_put(struct cxl_dev_state *cxlds)
 {
-	mutex_unlock(&cxlm->mbox_mutex);
+	mutex_unlock(&cxlds->mbox_mutex);
 }
 
-static int cxl_pci_mbox_send(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
 	int rc;
 
-	rc = cxl_pci_mbox_get(cxlm);
+	rc = cxl_pci_mbox_get(cxlds);
 	if (rc)
 		return rc;
 
-	rc = __cxl_pci_mbox_send_cmd(cxlm, cmd);
-	cxl_pci_mbox_put(cxlm);
+	rc = __cxl_pci_mbox_send_cmd(cxlds, cmd);
+	cxl_pci_mbox_put(cxlds);
 
 	return rc;
 }
 
-static int cxl_pci_setup_mailbox(struct cxl_mem *cxlm)
+static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
 {
-	const int cap = readl(cxlm->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
+	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
 
-	cxlm->mbox_send = cxl_pci_mbox_send;
-	cxlm->payload_size =
+	cxlds->mbox_send = cxl_pci_mbox_send;
+	cxlds->payload_size =
 		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
 
 	/*
@@ -293,15 +293,15 @@ static int cxl_pci_setup_mailbox(struct cxl_mem *cxlm)
 	 * there's no point in going forward. If the size is too large, there's
 	 * no harm is soft limiting it.
 	 */
-	cxlm->payload_size = min_t(size_t, cxlm->payload_size, SZ_1M);
-	if (cxlm->payload_size < 256) {
-		dev_err(cxlm->dev, "Mailbox is too small (%zub)",
-			cxlm->payload_size);
+	cxlds->payload_size = min_t(size_t, cxlds->payload_size, SZ_1M);
+	if (cxlds->payload_size < 256) {
+		dev_err(cxlds->dev, "Mailbox is too small (%zub)",
+			cxlds->payload_size);
 		return -ENXIO;
 	}
 
-	dev_dbg(cxlm->dev, "Mailbox payload sized %zu",
-		cxlm->payload_size);
+	dev_dbg(cxlds->dev, "Mailbox payload sized %zu",
+		cxlds->payload_size);
 
 	return 0;
 }
@@ -379,18 +379,18 @@ static int cxl_probe_regs(struct pci_dev *pdev, struct cxl_register_map *map)
 	return 0;
 }
 
-static int cxl_map_regs(struct cxl_mem *cxlm, struct cxl_register_map *map)
+static int cxl_map_regs(struct cxl_dev_state *cxlds, struct cxl_register_map *map)
 {
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 	struct pci_dev *pdev = to_pci_dev(dev);
 
 	switch (map->reg_type) {
 	case CXL_REGLOC_RBI_COMPONENT:
-		cxl_map_component_regs(pdev, &cxlm->regs.component, map);
+		cxl_map_component_regs(pdev, &cxlds->regs.component, map);
 		dev_dbg(dev, "Mapping component registers...\n");
 		break;
 	case CXL_REGLOC_RBI_MEMDEV:
-		cxl_map_device_regs(pdev, &cxlm->regs.device_regs, map);
+		cxl_map_device_regs(pdev, &cxlds->regs.device_regs, map);
 		dev_dbg(dev, "Probing device registers...\n");
 		break;
 	default:
@@ -475,7 +475,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;
 	struct cxl_memdev *cxlmd;
-	struct cxl_mem *cxlm;
+	struct cxl_dev_state *cxlds;
 	int rc;
 
 	/*
@@ -489,39 +489,39 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	cxlm = cxl_mem_create(&pdev->dev);
-	if (IS_ERR(cxlm))
-		return PTR_ERR(cxlm);
+	cxlds = cxl_dev_state_create(&pdev->dev);
+	if (IS_ERR(cxlds))
+		return PTR_ERR(cxlds);
 
 	rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
 	if (rc)
 		return rc;
 
-	rc = cxl_map_regs(cxlm, &map);
+	rc = cxl_map_regs(cxlds, &map);
 	if (rc)
 		return rc;
 
-	rc = cxl_pci_setup_mailbox(cxlm);
+	rc = cxl_pci_setup_mailbox(cxlds);
 	if (rc)
 		return rc;
 
-	rc = cxl_mem_enumerate_cmds(cxlm);
+	rc = cxl_enumerate_cmds(cxlds);
 	if (rc)
 		return rc;
 
-	rc = cxl_mem_identify(cxlm);
+	rc = cxl_dev_state_identify(cxlds);
 	if (rc)
 		return rc;
 
-	rc = cxl_mem_create_range_info(cxlm);
+	rc = cxl_mem_create_range_info(cxlds);
 	if (rc)
 		return rc;
 
-	cxlmd = devm_cxl_add_memdev(cxlm);
+	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-	if (range_len(&cxlm->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
+	if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
 		rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
 
 	return rc;
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 38bcbb4e9409..17e82ae90456 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -19,9 +19,9 @@ static struct workqueue_struct *cxl_pmem_wq;
 
 static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
 
-static void clear_exclusive(void *cxlm)
+static void clear_exclusive(void *cxlds)
 {
-	clear_exclusive_cxl_commands(cxlm, exclusive_cmds);
+	clear_exclusive_cxl_commands(cxlds, exclusive_cmds);
 }
 
 static void unregister_nvdimm(void *nvdimm)
@@ -34,7 +34,7 @@ static int cxl_nvdimm_probe(struct device *dev)
 	struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
 	struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
 	unsigned long flags = 0, cmd_mask = 0;
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct cxl_nvdimm_bridge *cxl_nvb;
 	struct nvdimm *nvdimm;
 	int rc;
@@ -49,8 +49,8 @@ static int cxl_nvdimm_probe(struct device *dev)
 		goto out;
 	}
 
-	set_exclusive_cxl_commands(cxlm, exclusive_cmds);
-	rc = devm_add_action_or_reset(dev, clear_exclusive, cxlm);
+	set_exclusive_cxl_commands(cxlds, exclusive_cmds);
+	rc = devm_add_action_or_reset(dev, clear_exclusive, cxlds);
 	if (rc)
 		goto out;
 
@@ -80,7 +80,7 @@ static struct cxl_driver cxl_nvdimm_driver = {
 	.id = CXL_DEVICE_NVDIMM,
 };
 
-static int cxl_pmem_get_config_size(struct cxl_mem *cxlm,
+static int cxl_pmem_get_config_size(struct cxl_dev_state *cxlds,
 				    struct nd_cmd_get_config_size *cmd,
 				    unsigned int buf_len)
 {
@@ -88,14 +88,14 @@ static int cxl_pmem_get_config_size(struct cxl_mem *cxlm,
 		return -EINVAL;
 
 	*cmd = (struct nd_cmd_get_config_size) {
-		 .config_size = cxlm->lsa_size,
-		 .max_xfer = cxlm->payload_size,
+		 .config_size = cxlds->lsa_size,
+		 .max_xfer = cxlds->payload_size,
 	};
 
 	return 0;
 }
 
-static int cxl_pmem_get_config_data(struct cxl_mem *cxlm,
+static int cxl_pmem_get_config_data(struct cxl_dev_state *cxlds,
 				    struct nd_cmd_get_config_data_hdr *cmd,
 				    unsigned int buf_len)
 {
@@ -112,15 +112,14 @@ static int cxl_pmem_get_config_data(struct cxl_mem *cxlm,
 		.length = cmd->in_length,
 	};
 
-	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LSA, &get_lsa,
-				   sizeof(get_lsa), cmd->out_buf,
-				   cmd->in_length);
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LSA, &get_lsa,
+			       sizeof(get_lsa), cmd->out_buf, cmd->in_length);
 	cmd->status = 0;
 
 	return rc;
 }
 
-static int cxl_pmem_set_config_data(struct cxl_mem *cxlm,
+static int cxl_pmem_set_config_data(struct cxl_dev_state *cxlds,
 				    struct nd_cmd_set_config_hdr *cmd,
 				    unsigned int buf_len)
 {
@@ -144,9 +143,9 @@ static int cxl_pmem_set_config_data(struct cxl_mem *cxlm,
 	};
 	memcpy(set_lsa->data, cmd->in_buf, cmd->in_length);
 
-	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_SET_LSA, set_lsa,
-				   struct_size(set_lsa, data, cmd->in_length),
-				   NULL, 0);
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_SET_LSA, set_lsa,
+			       struct_size(set_lsa, data, cmd->in_length),
+			       NULL, 0);
 
 	/*
 	 * Set "firmware" status (4-packed bytes at the end of the input
@@ -164,18 +163,18 @@ static int cxl_pmem_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd,
 	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
 	unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
 	struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
-	struct cxl_mem *cxlm = cxlmd->cxlm;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 
 	if (!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
 
 	switch (cmd) {
 	case ND_CMD_GET_CONFIG_SIZE:
-		return cxl_pmem_get_config_size(cxlm, buf, buf_len);
+		return cxl_pmem_get_config_size(cxlds, buf, buf_len);
 	case ND_CMD_GET_CONFIG_DATA:
-		return cxl_pmem_get_config_data(cxlm, buf, buf_len);
+		return cxl_pmem_get_config_data(cxlds, buf, buf_len);
 	case ND_CMD_SET_CONFIG_DATA:
-		return cxl_pmem_set_config_data(cxlm, buf, buf_len);
+		return cxl_pmem_set_config_data(cxlds, buf, buf_len);
 	default:
 		return -ENOTTY;
 	}
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 12a8437a9ca0..555faa4a57f7 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -54,7 +54,7 @@ static int mock_gsl(struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
-static int mock_get_log(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int mock_get_log(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
 	struct cxl_mbox_get_log *gl = cmd->payload_in;
 	u32 offset = le32_to_cpu(gl->offset);
@@ -64,7 +64,7 @@ static int mock_get_log(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 
 	if (cmd->size_in < sizeof(*gl))
 		return -EINVAL;
-	if (length > cxlm->payload_size)
+	if (length > cxlds->payload_size)
 		return -EINVAL;
 	if (offset + length > sizeof(mock_cel))
 		return -EINVAL;
@@ -78,9 +78,9 @@ static int mock_get_log(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
-static int mock_id(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int mock_id(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
-	struct platform_device *pdev = to_platform_device(cxlm->dev);
+	struct platform_device *pdev = to_platform_device(cxlds->dev);
 	struct cxl_mbox_identify id = {
 		.fw_revision = { "mock fw v1 " },
 		.lsa_size = cpu_to_le32(LSA_SIZE),
@@ -120,10 +120,10 @@ static int mock_id(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
-static int mock_get_lsa(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int mock_get_lsa(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
 	struct cxl_mbox_get_lsa *get_lsa = cmd->payload_in;
-	void *lsa = dev_get_drvdata(cxlm->dev);
+	void *lsa = dev_get_drvdata(cxlds->dev);
 	u32 offset, length;
 
 	if (sizeof(*get_lsa) > cmd->size_in)
@@ -139,10 +139,10 @@ static int mock_get_lsa(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
-static int mock_set_lsa(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int mock_set_lsa(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
 	struct cxl_mbox_set_lsa *set_lsa = cmd->payload_in;
-	void *lsa = dev_get_drvdata(cxlm->dev);
+	void *lsa = dev_get_drvdata(cxlds->dev);
 	u32 offset, length;
 
 	if (sizeof(*set_lsa) > cmd->size_in)
@@ -156,9 +156,9 @@ static int mock_set_lsa(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
-static int cxl_mock_mbox_send(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
+static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
-	struct device *dev = cxlm->dev;
+	struct device *dev = cxlds->dev;
 	int rc = -EIO;
 
 	switch (cmd->opcode) {
@@ -166,16 +166,16 @@ static int cxl_mock_mbox_send(struct cxl_mem *cxlm, struct cxl_mbox_cmd *cmd)
 		rc = mock_gsl(cmd);
 		break;
 	case CXL_MBOX_OP_GET_LOG:
-		rc = mock_get_log(cxlm, cmd);
+		rc = mock_get_log(cxlds, cmd);
 		break;
 	case CXL_MBOX_OP_IDENTIFY:
-		rc = mock_id(cxlm, cmd);
+		rc = mock_id(cxlds, cmd);
 		break;
 	case CXL_MBOX_OP_GET_LSA:
-		rc = mock_get_lsa(cxlm, cmd);
+		rc = mock_get_lsa(cxlds, cmd);
 		break;
 	case CXL_MBOX_OP_SET_LSA:
-		rc = mock_set_lsa(cxlm, cmd);
+		rc = mock_set_lsa(cxlds, cmd);
 		break;
 	default:
 		break;
@@ -196,7 +196,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct cxl_memdev *cxlmd;
-	struct cxl_mem *cxlm;
+	struct cxl_dev_state *cxlds;
 	void *lsa;
 	int rc;
 
@@ -208,30 +208,30 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 		return rc;
 	dev_set_drvdata(dev, lsa);
 
-	cxlm = cxl_mem_create(dev);
-	if (IS_ERR(cxlm))
-		return PTR_ERR(cxlm);
+	cxlds = cxl_dev_state_create(dev);
+	if (IS_ERR(cxlds))
+		return PTR_ERR(cxlds);
 
-	cxlm->mbox_send = cxl_mock_mbox_send;
-	cxlm->payload_size = SZ_4K;
+	cxlds->mbox_send = cxl_mock_mbox_send;
+	cxlds->payload_size = SZ_4K;
 
-	rc = cxl_mem_enumerate_cmds(cxlm);
+	rc = cxl_enumerate_cmds(cxlds);
 	if (rc)
 		return rc;
 
-	rc = cxl_mem_identify(cxlm);
+	rc = cxl_dev_state_identify(cxlds);
 	if (rc)
 		return rc;
 
-	rc = cxl_mem_create_range_info(cxlm);
+	rc = cxl_mem_create_range_info(cxlds);
 	if (rc)
 		return rc;
 
-	cxlmd = devm_cxl_add_memdev(cxlm);
+	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-	if (range_len(&cxlm->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
+	if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
 		rc = devm_cxl_add_nvdimm(dev, cxlmd);
 
 	return 0;
-- 
cgit v1.2.3


From 09eac2ca988a866225a174212765f77c85702206 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Sun, 17 Oct 2021 23:12:51 -0600
Subject: tools/testing/cxl: add mock output for the GET_HEALTH_INFO command

Add mocked health information for cxl_test memdevs. This allows
cxl-cli's 'list' command to display the canned health_info fields.

Cc: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Link: https://lore.kernel.org/r/20211018051251.2289112-1-vishal.l.verma@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/test/mem.c | 49 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 555faa4a57f7..8c2086c4caef 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -28,8 +28,24 @@ static struct cxl_cel_entry mock_cel[] = {
 		.opcode = cpu_to_le16(CXL_MBOX_OP_SET_LSA),
 		.effect = cpu_to_le16(EFFECT(1) | EFFECT(2)),
 	},
+	{
+		.opcode = cpu_to_le16(CXL_MBOX_OP_GET_HEALTH_INFO),
+		.effect = cpu_to_le16(0),
+	},
 };
 
+/* See CXL 2.0 Table 181 Get Health Info Output Payload */
+struct cxl_mbox_health_info {
+	u8 health_status;
+	u8 media_status;
+	u8 ext_status;
+	u8 life_used;
+	__le16 temperature;
+	__le32 dirty_shutdowns;
+	__le32 volatile_errors;
+	__le32 pmem_errors;
+} __packed;
+
 static struct {
 	struct cxl_mbox_get_supported_logs gsl;
 	struct cxl_gsl_entry entry;
@@ -156,6 +172,36 @@ static int mock_set_lsa(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 	return 0;
 }
 
+static int mock_health_info(struct cxl_dev_state *cxlds,
+			    struct cxl_mbox_cmd *cmd)
+{
+	struct cxl_mbox_health_info health_info = {
+		/* set flags for maint needed, perf degraded, hw replacement */
+		.health_status = 0x7,
+		/* set media status to "All Data Lost" */
+		.media_status = 0x3,
+		/*
+		 * set ext_status flags for:
+		 *  ext_life_used: normal,
+		 *  ext_temperature: critical,
+		 *  ext_corrected_volatile: warning,
+		 *  ext_corrected_persistent: normal,
+		 */
+		.ext_status = 0x18,
+		.life_used = 15,
+		.temperature = cpu_to_le16(25),
+		.dirty_shutdowns = cpu_to_le32(10),
+		.volatile_errors = cpu_to_le32(20),
+		.pmem_errors = cpu_to_le32(30),
+	};
+
+	if (cmd->size_out < sizeof(health_info))
+		return -EINVAL;
+
+	memcpy(cmd->payload_out, &health_info, sizeof(health_info));
+	return 0;
+}
+
 static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
 	struct device *dev = cxlds->dev;
@@ -177,6 +223,9 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *
 	case CXL_MBOX_OP_SET_LSA:
 		rc = mock_set_lsa(cxlds, cmd);
 		break;
+	case CXL_MBOX_OP_GET_HEALTH_INFO:
+		rc = mock_health_info(cxlds, cmd);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 814dff9ae234d70003b8733a637fec621c90f0bc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Oct 2021 12:51:53 -0700
Subject: cxl/test: Mock acpi_table_parse_cedt()

Now that cxl_acpi has been converted to use the core ACPI CEDT sub-table
parser, update cxl_test to inject CFMWS and CHBS data directly into
cxl_acpi's handlers.

Cc: Alison Schofield <alison.schofield@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/163553711363.2509508.17428994087868269952.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/acpi.c            |  2 ++
 tools/testing/cxl/Kbuild      |  3 +-
 tools/testing/cxl/test/cxl.c  | 68 +++++++++++++++++++++++++++++--------------
 tools/testing/cxl/test/mock.c | 30 ++++++-------------
 tools/testing/cxl/test/mock.h |  6 ++--
 5 files changed, 61 insertions(+), 48 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 7820082a2746..91e4072e7649 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -283,6 +283,7 @@ static int add_host_bridge_uport(struct device *match, void *arg)
 }
 
 struct cxl_chbs_context {
+	struct device *dev;
 	unsigned long long uid;
 	resource_size_t chbcr;
 };
@@ -327,6 +328,7 @@ static int add_host_bridge_dport(struct device *match, void *arg)
 	}
 
 	ctx = (struct cxl_chbs_context) {
+		.dev = host,
 		.uid = uid,
 	};
 	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbcr, &ctx);
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 86deba8308a1..1acdf2fc31c5 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
+ldflags-y += --wrap=acpi_table_parse_cedt
 ldflags-y += --wrap=is_acpi_device_node
-ldflags-y += --wrap=acpi_get_table
-ldflags-y += --wrap=acpi_put_table
 ldflags-y += --wrap=acpi_evaluate_integer
 ldflags-y += --wrap=acpi_pci_find_root
 ldflags-y += --wrap=pci_walk_bus
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index cb32f9e27d5d..736d99006fb7 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -182,6 +182,13 @@ static struct {
 	},
 };
 
+struct acpi_cedt_cfmws *mock_cfmws[4] = {
+	[0] = &mock_cedt.cfmws0.cfmws,
+	[1] = &mock_cedt.cfmws1.cfmws,
+	[2] = &mock_cedt.cfmws2.cfmws,
+	[3] = &mock_cedt.cfmws3.cfmws,
+};
+
 struct cxl_mock_res {
 	struct list_head list;
 	struct range range;
@@ -232,12 +239,6 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size)
 
 static int populate_cedt(void)
 {
-	struct acpi_cedt_cfmws *cfmws[4] = {
-		[0] = &mock_cedt.cfmws0.cfmws,
-		[1] = &mock_cedt.cfmws1.cfmws,
-		[2] = &mock_cedt.cfmws2.cfmws,
-		[3] = &mock_cedt.cfmws3.cfmws,
-	};
 	struct cxl_mock_res *res;
 	int i;
 
@@ -257,8 +258,8 @@ static int populate_cedt(void)
 		chbs->length = size;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(cfmws); i++) {
-		struct acpi_cedt_cfmws *window = cfmws[i];
+	for (i = 0; i < ARRAY_SIZE(mock_cfmws); i++) {
+		struct acpi_cedt_cfmws *window = mock_cfmws[i];
 
 		res = alloc_mock_res(window->window_size);
 		if (!res)
@@ -269,21 +270,44 @@ static int populate_cedt(void)
 	return 0;
 }
 
-static acpi_status mock_acpi_get_table(char *signature, u32 instance,
-				       struct acpi_table_header **out_table)
+/*
+ * WARNING, this hack assumes the format of 'struct
+ * cxl_cfmws_context' and 'struct cxl_chbs_context' share the property that
+ * the first struct member is the device being probed by the cxl_acpi
+ * driver.
+ */
+struct cxl_cedt_context {
+	struct device *dev;
+};
+
+static int mock_acpi_table_parse_cedt(enum acpi_cedt_type id,
+				      acpi_tbl_entry_handler_arg handler_arg,
+				      void *arg)
 {
-	if (instance < U32_MAX || strcmp(signature, ACPI_SIG_CEDT) != 0)
-		return acpi_get_table(signature, instance, out_table);
+	struct cxl_cedt_context *ctx = arg;
+	struct device *dev = ctx->dev;
+	union acpi_subtable_headers *h;
+	unsigned long end;
+	int i;
 
-	*out_table = (struct acpi_table_header *) &mock_cedt;
-	return AE_OK;
-}
+	if (dev != &cxl_acpi->dev)
+		return acpi_table_parse_cedt(id, handler_arg, arg);
 
-static void mock_acpi_put_table(struct acpi_table_header *table)
-{
-	if (table == (struct acpi_table_header *) &mock_cedt)
-		return;
-	acpi_put_table(table);
+	if (id == ACPI_CEDT_TYPE_CHBS)
+		for (i = 0; i < ARRAY_SIZE(mock_cedt.chbs); i++) {
+			h = (union acpi_subtable_headers *)&mock_cedt.chbs[i];
+			end = (unsigned long)&mock_cedt.chbs[i + 1];
+			handler_arg(h, arg, end);
+		}
+
+	if (id == ACPI_CEDT_TYPE_CFMWS)
+		for (i = 0; i < ARRAY_SIZE(mock_cfmws); i++) {
+			h = (union acpi_subtable_headers *) mock_cfmws[i];
+			end = (unsigned long) h + mock_cfmws[i]->header.length;
+			handler_arg(h, arg, end);
+		}
+
+	return 0;
 }
 
 static bool is_mock_bridge(struct device *dev)
@@ -388,8 +412,7 @@ static struct cxl_mock_ops cxl_mock_ops = {
 	.is_mock_port = is_mock_port,
 	.is_mock_dev = is_mock_dev,
 	.mock_port = mock_cxl_root_port,
-	.acpi_get_table = mock_acpi_get_table,
-	.acpi_put_table = mock_acpi_put_table,
+	.acpi_table_parse_cedt = mock_acpi_table_parse_cedt,
 	.acpi_evaluate_integer = mock_acpi_evaluate_integer,
 	.acpi_pci_find_root = mock_acpi_pci_find_root,
 	.list = LIST_HEAD_INIT(cxl_mock_ops.list),
@@ -574,3 +597,4 @@ static __exit void cxl_test_exit(void)
 module_init(cxl_test_init);
 module_exit(cxl_test_exit);
 MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(ACPI);
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index b8c108abcf07..17408f892df4 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -58,36 +58,23 @@ bool __wrap_is_acpi_device_node(const struct fwnode_handle *fwnode)
 }
 EXPORT_SYMBOL(__wrap_is_acpi_device_node);
 
-acpi_status __wrap_acpi_get_table(char *signature, u32 instance,
-				  struct acpi_table_header **out_table)
+int __wrap_acpi_table_parse_cedt(enum acpi_cedt_type id,
+				 acpi_tbl_entry_handler_arg handler_arg,
+				 void *arg)
 {
-	int index;
+	int index, rc;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-	acpi_status status;
 
 	if (ops)
-		status = ops->acpi_get_table(signature, instance, out_table);
+		rc = ops->acpi_table_parse_cedt(id, handler_arg, arg);
 	else
-		status = acpi_get_table(signature, instance, out_table);
+		rc = acpi_table_parse_cedt(id, handler_arg, arg);
 
 	put_cxl_mock_ops(index);
 
-	return status;
-}
-EXPORT_SYMBOL(__wrap_acpi_get_table);
-
-void __wrap_acpi_put_table(struct acpi_table_header *table)
-{
-	int index;
-	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-
-	if (ops)
-		ops->acpi_put_table(table);
-	else
-		acpi_put_table(table);
-	put_cxl_mock_ops(index);
+	return rc;
 }
-EXPORT_SYMBOL(__wrap_acpi_put_table);
+EXPORT_SYMBOL_NS_GPL(__wrap_acpi_table_parse_cedt, ACPI);
 
 acpi_status __wrap_acpi_evaluate_integer(acpi_handle handle,
 					 acpi_string pathname,
@@ -169,3 +156,4 @@ __wrap_nvdimm_bus_register(struct device *dev,
 EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register);
 
 MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(ACPI);
diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h
index 805a94cb3fbe..15ed0fd877e4 100644
--- a/tools/testing/cxl/test/mock.h
+++ b/tools/testing/cxl/test/mock.h
@@ -6,9 +6,9 @@
 struct cxl_mock_ops {
 	struct list_head list;
 	bool (*is_mock_adev)(struct acpi_device *dev);
-	acpi_status (*acpi_get_table)(char *signature, u32 instance,
-				      struct acpi_table_header **out_table);
-	void (*acpi_put_table)(struct acpi_table_header *table);
+	int (*acpi_table_parse_cedt)(enum acpi_cedt_type id,
+				     acpi_tbl_entry_handler_arg handler_arg,
+				     void *arg);
 	bool (*is_mock_bridge)(struct device *dev);
 	acpi_status (*acpi_evaluate_integer)(acpi_handle handle,
 					     acpi_string pathname,
-- 
cgit v1.2.3


From 5064343fb155487362708bacc8c6ab9dc2c52bb8 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 15 Nov 2021 10:35:14 -0800
Subject: selftests/sgx: Fix a benign linker warning

The enclave binary (test_encl.elf) is built with only three sections (tcs,
text, and data) as controlled by its custom linker script.

If gcc is built with "--enable-linker-build-id" (this appears to be a
common configuration even if it is by default off) then gcc
will pass "--build-id" to the linker that will prompt it (the linker) to
write unique bits identifying the linked file to a ".note.gnu.build-id"
section.

The section ".note.gnu.build-id" does not exist in the test enclave
resulting in the following warning emitted by the linker:

/usr/bin/ld: warning: .note.gnu.build-id section discarded, --build-id
ignored

The test enclave does not use the build id within the binary so fix the
warning by passing a build id of "none" to the linker that will disable the
setting from any earlier "--build-id" options and thus disable the attempt
to write the build id to a ".note.gnu.build-id" section that does not
exist.

Link: https://lore.kernel.org/linux-sgx/20191017030340.18301-2-sean.j.christopherson@intel.com/
Suggested-by: Cedric Xing <cedric.xing@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/ca0f8a81fc1e78af9bdbc6a88e0f9c37d82e53f2.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile
index 7f12d55b97f8..2956584e1e37 100644
--- a/tools/testing/selftests/sgx/Makefile
+++ b/tools/testing/selftests/sgx/Makefile
@@ -45,7 +45,7 @@ $(OUTPUT)/sign_key.o: sign_key.S
 	$(CC) $(HOST_CFLAGS) -c $< -o $@
 
 $(OUTPUT)/test_encl.elf: test_encl.lds test_encl.c test_encl_bootstrap.S
-	$(CC) $(ENCL_CFLAGS) -T $^ -o $@
+	$(CC) $(ENCL_CFLAGS) -T $^ -o $@ -Wl,--build-id=none
 
 EXTRA_CLEAN := \
 	$(OUTPUT)/test_encl.elf \
-- 
cgit v1.2.3


From 39f62536be2f6160bba7294b5208e240d34703c3 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:15 -0800
Subject: selftests/sgx: Assign source for each segment

Define source per segment so that enclave pages can be added from different
sources, e.g. anonymous VMA for zero pages. In other words, add 'src' field
to struct encl_segment, and assign it to 'encl->src' for pages inherited
from the enclave binary.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/7850709c3089fe20e4bcecb8295ba87c54cc2b4a.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/load.c      | 5 +++--
 tools/testing/selftests/sgx/main.h      | 1 +
 tools/testing/selftests/sgx/sigstruct.c | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c
index 3ebe5d1fe337..5605474aab73 100644
--- a/tools/testing/selftests/sgx/load.c
+++ b/tools/testing/selftests/sgx/load.c
@@ -107,7 +107,7 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg)
 	memset(&secinfo, 0, sizeof(secinfo));
 	secinfo.flags = seg->flags;
 
-	ioc.src = (uint64_t)encl->src + seg->offset;
+	ioc.src = (uint64_t)seg->src;
 	ioc.offset = seg->offset;
 	ioc.length = seg->size;
 	ioc.secinfo = (unsigned long)&secinfo;
@@ -216,6 +216,7 @@ bool encl_load(const char *path, struct encl *encl)
 
 		if (j == 0) {
 			src_offset = phdr->p_offset & PAGE_MASK;
+			encl->src = encl->bin + src_offset;
 
 			seg->prot = PROT_READ | PROT_WRITE;
 			seg->flags = SGX_PAGE_TYPE_TCS << 8;
@@ -228,13 +229,13 @@ bool encl_load(const char *path, struct encl *encl)
 
 		seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset;
 		seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK;
+		seg->src = encl->src + seg->offset;
 
 		j++;
 	}
 
 	assert(j == encl->nr_segments);
 
-	encl->src = encl->bin + src_offset;
 	encl->src_size = encl->segment_tbl[j - 1].offset +
 			 encl->segment_tbl[j - 1].size;
 
diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h
index 68672fd86cf9..452d11dc4889 100644
--- a/tools/testing/selftests/sgx/main.h
+++ b/tools/testing/selftests/sgx/main.h
@@ -7,6 +7,7 @@
 #define MAIN_H
 
 struct encl_segment {
+	void *src;
 	off_t offset;
 	size_t size;
 	unsigned int prot;
diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c
index 92bbc5a15c39..202a96fd81bf 100644
--- a/tools/testing/selftests/sgx/sigstruct.c
+++ b/tools/testing/selftests/sgx/sigstruct.c
@@ -289,14 +289,14 @@ static bool mrenclave_eextend(EVP_MD_CTX *ctx, uint64_t offset,
 static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl,
 			      struct encl_segment *seg)
 {
-	uint64_t end = seg->offset + seg->size;
+	uint64_t end = seg->size;
 	uint64_t offset;
 
-	for (offset = seg->offset; offset < end; offset += PAGE_SIZE) {
-		if (!mrenclave_eadd(ctx, offset, seg->flags))
+	for (offset = 0; offset < end; offset += PAGE_SIZE) {
+		if (!mrenclave_eadd(ctx, seg->offset + offset, seg->flags))
 			return false;
 
-		if (!mrenclave_eextend(ctx, offset, encl->src + offset))
+		if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset))
 			return false;
 	}
 
-- 
cgit v1.2.3


From 5f0ce664d8c6c160ce4333e809545a8a57fe2baf Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:16 -0800
Subject: selftests/sgx: Make data measurement for an enclave segment optional

For a heap makes sense to leave its contents "unmeasured" in the SGX
enclave build process, meaning that they won't contribute to the
cryptographic signature (a RSA-3072 signed SHA56 hash) of the enclave.

Enclaves are signed blobs where the signature is calculated both from
page data and also from "structural properties" of the pages.  For
instance a page offset of *every* page added to the enclave is hashed.

For data, this is optional, not least because hashing a page has a
significant contribution to the enclave load time. Thus, where there is
no reason to hash, do not. The SGX ioctl interface supports this with
SGX_PAGE_MEASURE flag. Only when the flag is *set*, data is measured.

Add seg->measure boolean flag to struct encl_segment. Only when the
flag is set, include the segment data to the signature (represented
by SIGSTRUCT architectural structure).

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/625b6fe28fed76275e9238ec4e15ec3c0d87de81.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/load.c      | 6 +++++-
 tools/testing/selftests/sgx/main.h      | 1 +
 tools/testing/selftests/sgx/sigstruct.c | 6 ++++--
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c
index 5605474aab73..f1be78984c50 100644
--- a/tools/testing/selftests/sgx/load.c
+++ b/tools/testing/selftests/sgx/load.c
@@ -111,7 +111,10 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg)
 	ioc.offset = seg->offset;
 	ioc.length = seg->size;
 	ioc.secinfo = (unsigned long)&secinfo;
-	ioc.flags = SGX_PAGE_MEASURE;
+	if (seg->measure)
+		ioc.flags = SGX_PAGE_MEASURE;
+	else
+		ioc.flags = 0;
 
 	rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc);
 	if (rc < 0) {
@@ -230,6 +233,7 @@ bool encl_load(const char *path, struct encl *encl)
 		seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset;
 		seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK;
 		seg->src = encl->src + seg->offset;
+		seg->measure = true;
 
 		j++;
 	}
diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h
index 452d11dc4889..aebc69e7cdc8 100644
--- a/tools/testing/selftests/sgx/main.h
+++ b/tools/testing/selftests/sgx/main.h
@@ -12,6 +12,7 @@ struct encl_segment {
 	size_t size;
 	unsigned int prot;
 	unsigned int flags;
+	bool measure;
 };
 
 struct encl {
diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c
index 202a96fd81bf..50c5ab1aa6fa 100644
--- a/tools/testing/selftests/sgx/sigstruct.c
+++ b/tools/testing/selftests/sgx/sigstruct.c
@@ -296,8 +296,10 @@ static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl,
 		if (!mrenclave_eadd(ctx, seg->offset + offset, seg->flags))
 			return false;
 
-		if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset))
-			return false;
+		if (seg->measure) {
+			if (!mrenclave_eextend(ctx, seg->offset + offset, seg->src + offset))
+				return false;
+		}
 	}
 
 	return true;
-- 
cgit v1.2.3


From 3200505d4de6436af799d7be743d9dc87450ee5a Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:17 -0800
Subject: selftests/sgx: Create a heap for the test enclave

Create a heap for the test enclave, which is allocated from /dev/null,
and left unmeasured. This is beneficial by its own because it verifies
that an enclave built from multiple choices, works properly. If LSM
hooks are added for SGX some day, a multi source enclave has higher
probability to trigger bugs on access control checks.

The immediate need comes from the need to implement page reclaim tests.
In order to trigger the page reclaimer, one can just set the size of
the heap to high enough.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/e070c5f23578c29608051cab879b1d276963a27a.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/load.c | 29 ++++++++++++++++++++++-------
 tools/testing/selftests/sgx/main.c |  2 +-
 tools/testing/selftests/sgx/main.h |  4 +++-
 3 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c
index f1be78984c50..9d4322c946e2 100644
--- a/tools/testing/selftests/sgx/load.c
+++ b/tools/testing/selftests/sgx/load.c
@@ -21,6 +21,8 @@
 
 void encl_delete(struct encl *encl)
 {
+	struct encl_segment *heap_seg = &encl->segment_tbl[encl->nr_segments - 1];
+
 	if (encl->encl_base)
 		munmap((void *)encl->encl_base, encl->encl_size);
 
@@ -30,6 +32,8 @@ void encl_delete(struct encl *encl)
 	if (encl->fd)
 		close(encl->fd);
 
+	munmap(heap_seg->src, heap_seg->size);
+
 	if (encl->segment_tbl)
 		free(encl->segment_tbl);
 
@@ -125,11 +129,10 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg)
 	return true;
 }
 
-
-
-bool encl_load(const char *path, struct encl *encl)
+bool encl_load(const char *path, struct encl *encl, unsigned long heap_size)
 {
 	const char device_path[] = "/dev/sgx_enclave";
+	struct encl_segment *seg;
 	Elf64_Phdr *phdr_tbl;
 	off_t src_offset;
 	Elf64_Ehdr *ehdr;
@@ -181,6 +184,8 @@ bool encl_load(const char *path, struct encl *encl)
 	ehdr = encl->bin;
 	phdr_tbl = encl->bin + ehdr->e_phoff;
 
+	encl->nr_segments = 1; /* one for the heap */
+
 	for (i = 0; i < ehdr->e_phnum; i++) {
 		Elf64_Phdr *phdr = &phdr_tbl[i];
 
@@ -196,7 +201,6 @@ bool encl_load(const char *path, struct encl *encl)
 	for (i = 0, j = 0; i < ehdr->e_phnum; i++) {
 		Elf64_Phdr *phdr = &phdr_tbl[i];
 		unsigned int flags = phdr->p_flags;
-		struct encl_segment *seg;
 
 		if (phdr->p_type != PT_LOAD)
 			continue;
@@ -238,10 +242,21 @@ bool encl_load(const char *path, struct encl *encl)
 		j++;
 	}
 
-	assert(j == encl->nr_segments);
+	assert(j == encl->nr_segments - 1);
+
+	seg = &encl->segment_tbl[j];
+	seg->offset =  encl->segment_tbl[j - 1].offset + encl->segment_tbl[j - 1].size;
+	seg->size = heap_size;
+	seg->src = mmap(NULL, heap_size, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	seg->prot = PROT_READ | PROT_WRITE;
+	seg->flags = (SGX_PAGE_TYPE_REG << 8) | seg->prot;
+	seg->measure = false;
+
+	if (seg->src == MAP_FAILED)
+		goto err;
 
-	encl->src_size = encl->segment_tbl[j - 1].offset +
-			 encl->segment_tbl[j - 1].size;
+	encl->src_size = encl->segment_tbl[j].offset + encl->segment_tbl[j].size;
 
 	for (encl->encl_size = 4096; encl->encl_size < encl->src_size; )
 		encl->encl_size <<= 1;
diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index e252015e0c15..6858a35fed20 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -122,7 +122,7 @@ FIXTURE_SETUP(enclave)
 	unsigned int i;
 	void *addr;
 
-	if (!encl_load("test_encl.elf", &self->encl)) {
+	if (!encl_load("test_encl.elf", &self->encl, ENCL_HEAP_SIZE_DEFAULT)) {
 		encl_delete(&self->encl);
 		ksft_exit_skip("cannot load enclaves\n");
 	}
diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h
index aebc69e7cdc8..b45c52ec7ab3 100644
--- a/tools/testing/selftests/sgx/main.h
+++ b/tools/testing/selftests/sgx/main.h
@@ -6,6 +6,8 @@
 #ifndef MAIN_H
 #define MAIN_H
 
+#define ENCL_HEAP_SIZE_DEFAULT	4096
+
 struct encl_segment {
 	void *src;
 	off_t offset;
@@ -33,7 +35,7 @@ extern unsigned char sign_key[];
 extern unsigned char sign_key_end[];
 
 void encl_delete(struct encl *ctx);
-bool encl_load(const char *path, struct encl *encl);
+bool encl_load(const char *path, struct encl *encl, unsigned long heap_size);
 bool encl_measure(struct encl *encl);
 bool encl_build(struct encl *encl);
 
-- 
cgit v1.2.3


From 1471721489090515f9f0f059b25124898928e559 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:18 -0800
Subject: selftests/sgx: Dump segments and /proc/self/maps only on failure

Logging is always a compromise between clarity and detail. The main use
case for dumping VMA's is when FIXTURE_SETUP() fails, and is less important
for enclaves that do initialize correctly. Therefore, print the segments
and /proc/self/maps only in the error case.

Finally, if a single test ever creates multiple enclaves, the amount of
log lines would become enormous.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/23cef0ae1de3a8a74cbfbbe74eca48ca3f300fde.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 6858a35fed20..deab02f2f3ce 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -127,12 +127,6 @@ FIXTURE_SETUP(enclave)
 		ksft_exit_skip("cannot load enclaves\n");
 	}
 
-	for (i = 0; i < self->encl.nr_segments; i++) {
-		seg = &self->encl.segment_tbl[i];
-
-		TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot);
-	}
-
 	if (!encl_measure(&self->encl))
 		goto err;
 
@@ -169,6 +163,17 @@ FIXTURE_SETUP(enclave)
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
+	return;
+
+err:
+	encl_delete(&self->encl);
+
+	for (i = 0; i < self->encl.nr_segments; i++) {
+		seg = &self->encl.segment_tbl[i];
+
+		TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot);
+	}
+
 	maps_file = fopen("/proc/self/maps", "r");
 	if (maps_file != NULL)  {
 		while (fgets(maps_line, sizeof(maps_line), maps_file) != NULL) {
@@ -181,11 +186,7 @@ FIXTURE_SETUP(enclave)
 		fclose(maps_file);
 	}
 
-err:
-	if (!sgx_enter_enclave_sym)
-		encl_delete(&self->encl);
-
-	ASSERT_NE(sgx_enter_enclave_sym, NULL);
+	ASSERT_TRUE(false);
 }
 
 FIXTURE_TEARDOWN(enclave)
-- 
cgit v1.2.3


From 1b35eb719549ab5143d61f9e09b0771cd3d00d94 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:19 -0800
Subject: selftests/sgx: Encpsulate the test enclave creation

Introduce setup_test_encl() so that the enclave creation can be moved to
TEST_F()'s. This is required for a reclaimer test where the heap size needs
to be set large enough to triger the page reclaimer.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/bee0ca867a95828a569c1ba2a8e443a44047dc71.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 44 ++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index deab02f2f3ce..5b3e49a36344 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -112,7 +112,8 @@ FIXTURE(enclave) {
 	struct sgx_enclave_run run;
 };
 
-FIXTURE_SETUP(enclave)
+static bool setup_test_encl(unsigned long heap_size, struct encl *encl,
+			    struct __test_metadata *_metadata)
 {
 	Elf64_Sym *sgx_enter_enclave_sym = NULL;
 	struct vdso_symtab symtab;
@@ -122,25 +123,25 @@ FIXTURE_SETUP(enclave)
 	unsigned int i;
 	void *addr;
 
-	if (!encl_load("test_encl.elf", &self->encl, ENCL_HEAP_SIZE_DEFAULT)) {
-		encl_delete(&self->encl);
-		ksft_exit_skip("cannot load enclaves\n");
+	if (!encl_load("test_encl.elf", encl, heap_size)) {
+		encl_delete(encl);
+		TH_LOG("Failed to load the test enclave.\n");
 	}
 
-	if (!encl_measure(&self->encl))
+	if (!encl_measure(encl))
 		goto err;
 
-	if (!encl_build(&self->encl))
+	if (!encl_build(encl))
 		goto err;
 
 	/*
 	 * An enclave consumer only must do this.
 	 */
-	for (i = 0; i < self->encl.nr_segments; i++) {
-		struct encl_segment *seg = &self->encl.segment_tbl[i];
+	for (i = 0; i < encl->nr_segments; i++) {
+		struct encl_segment *seg = &encl->segment_tbl[i];
 
-		addr = mmap((void *)self->encl.encl_base + seg->offset, seg->size,
-			    seg->prot, MAP_SHARED | MAP_FIXED, self->encl.fd, 0);
+		addr = mmap((void *)encl->encl_base + seg->offset, seg->size,
+			    seg->prot, MAP_SHARED | MAP_FIXED, encl->fd, 0);
 		EXPECT_NE(addr, MAP_FAILED);
 		if (addr == MAP_FAILED)
 			goto err;
@@ -160,16 +161,13 @@ FIXTURE_SETUP(enclave)
 
 	vdso_sgx_enter_enclave = addr + sgx_enter_enclave_sym->st_value;
 
-	memset(&self->run, 0, sizeof(self->run));
-	self->run.tcs = self->encl.encl_base;
-
-	return;
+	return true;
 
 err:
-	encl_delete(&self->encl);
+	encl_delete(encl);
 
-	for (i = 0; i < self->encl.nr_segments; i++) {
-		seg = &self->encl.segment_tbl[i];
+	for (i = 0; i < encl->nr_segments; i++) {
+		seg = &encl->segment_tbl[i];
 
 		TH_LOG("0x%016lx 0x%016lx 0x%02x", seg->offset, seg->size, seg->prot);
 	}
@@ -186,7 +184,17 @@ err:
 		fclose(maps_file);
 	}
 
-	ASSERT_TRUE(false);
+	TH_LOG("Failed to initialize the test enclave.\n");
+
+	return false;
+}
+
+FIXTURE_SETUP(enclave)
+{
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
 }
 
 FIXTURE_TEARDOWN(enclave)
-- 
cgit v1.2.3


From 065825db1fd60aa7695565613a69ed086a831869 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:20 -0800
Subject: selftests/sgx: Move setup_test_encl() to each TEST_F()

Create the test enclave inside each TEST_F(), instead of FIXTURE_SETUP(),
so that the heap size can be defined per test.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/70ca264535d2ca0dc8dcaf2281e7d6965f8d4a24.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 5b3e49a36344..f41fba919d06 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -191,10 +191,6 @@ err:
 
 FIXTURE_SETUP(enclave)
 {
-	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
-
-	memset(&self->run, 0, sizeof(self->run));
-	self->run.tcs = self->encl.encl_base;
 }
 
 FIXTURE_TEARDOWN(enclave)
@@ -226,6 +222,11 @@ TEST_F(enclave, unclobbered_vdso)
 {
 	struct encl_op op;
 
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
 	op.type = ENCL_OP_PUT;
 	op.buffer = MAGIC;
 
@@ -248,6 +249,11 @@ TEST_F(enclave, clobbered_vdso)
 {
 	struct encl_op op;
 
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
 	op.type = ENCL_OP_PUT;
 	op.buffer = MAGIC;
 
@@ -278,6 +284,11 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 {
 	struct encl_op op;
 
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
 	self->run.user_handler = (__u64)test_handler;
 	self->run.user_data = 0xdeadbeef;
 
-- 
cgit v1.2.3


From f0ff2447b8613b883f41ae845b6cc7540d6e5f71 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:21 -0800
Subject: selftests/sgx: Add a new kselftest: Unclobbered_vdso_oversubscribed

Add a variation of the unclobbered_vdso test.

In the new test, create a heap for the test enclave, which has the same
size as all available Enclave Page Cache (EPC) pages in the system. This
will guarantee that all test_encl.elf pages *and* SGX Enclave Control
Structure (SECS) have been swapped out by the page reclaimer during the
load time.

This test will trigger both the page reclaimer and the page fault handler.
The page reclaimer triggered, while the heap is being created during the
load time. The page fault handler is triggered for all the required pages,
while the test case is executing.

Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/41f7c508eea79a3198b5014d7691903be08f9ff1.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/main.c | 75 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index f41fba919d06..ee8139a22a3c 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -245,6 +245,81 @@ TEST_F(enclave, unclobbered_vdso)
 	EXPECT_EQ(self->run.user_data, 0);
 }
 
+/*
+ * A section metric is concatenated in a way that @low bits 12-31 define the
+ * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
+ * metric.
+ */
+static unsigned long sgx_calc_section_metric(unsigned int low,
+					     unsigned int high)
+{
+	return (low & GENMASK_ULL(31, 12)) +
+	       ((high & GENMASK_ULL(19, 0)) << 32);
+}
+
+/*
+ * Sum total available physical SGX memory across all EPC sections
+ *
+ * Return: total available physical SGX memory available on system
+ */
+static unsigned long get_total_epc_mem(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned long total_size = 0;
+	unsigned int type;
+	int section = 0;
+
+	while (true) {
+		eax = SGX_CPUID;
+		ecx = section + SGX_CPUID_EPC;
+		__cpuid(&eax, &ebx, &ecx, &edx);
+
+		type = eax & SGX_CPUID_EPC_MASK;
+		if (type == SGX_CPUID_EPC_INVALID)
+			break;
+
+		if (type != SGX_CPUID_EPC_SECTION)
+			break;
+
+		total_size += sgx_calc_section_metric(ecx, edx);
+
+		section++;
+	}
+
+	return total_size;
+}
+
+TEST_F(enclave, unclobbered_vdso_oversubscribed)
+{
+	unsigned long total_mem;
+	struct encl_op op;
+
+	total_mem = get_total_epc_mem();
+	ASSERT_NE(total_mem, 0);
+	ASSERT_TRUE(setup_test_encl(total_mem, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	op.type = ENCL_OP_PUT;
+	op.buffer = MAGIC;
+
+	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.user_data, 0);
+
+	op.type = ENCL_OP_GET;
+	op.buffer = 0;
+
+	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+
+	EXPECT_EQ(op.buffer, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.user_data, 0);
+
+}
+
 TEST_F(enclave, clobbered_vdso)
 {
 	struct encl_op op;
-- 
cgit v1.2.3


From 41493a095e487c207b4b702aee2f8c59a7294e4f Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 15 Nov 2021 10:35:22 -0800
Subject: selftests/sgx: Provide per-op parameter structs for the test enclave

To add more operations to the test enclave, the protocol needs to allow
to have operations with varying parameters. Create a separate parameter
struct for each existing operation, with the shared parameters in struct
encl_op_header.

[reinette: rebased to apply on top of oversubscription test series]
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/f9a4a8c436b538003b8ebddaa66083992053cef1.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/defines.h   | 14 ++++++-
 tools/testing/selftests/sgx/main.c      | 68 +++++++++++++++++----------------
 tools/testing/selftests/sgx/test_encl.c | 33 ++++++++++------
 3 files changed, 69 insertions(+), 46 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h
index f88562afcaa0..6ff95a766287 100644
--- a/tools/testing/selftests/sgx/defines.h
+++ b/tools/testing/selftests/sgx/defines.h
@@ -21,11 +21,21 @@
 enum encl_op_type {
 	ENCL_OP_PUT,
 	ENCL_OP_GET,
+	ENCL_OP_MAX,
 };
 
-struct encl_op {
+struct encl_op_header {
 	uint64_t type;
-	uint64_t buffer;
+};
+
+struct encl_op_put {
+	struct encl_op_header header;
+	uint64_t value;
+};
+
+struct encl_op_get {
+	struct encl_op_header header;
+	uint64_t value;
 };
 
 #endif /* DEFINES_H */
diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index ee8139a22a3c..3996c00486b6 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -220,27 +220,28 @@ FIXTURE_TEARDOWN(enclave)
 
 TEST_F(enclave, unclobbered_vdso)
 {
-	struct encl_op op;
+	struct encl_op_put put_op;
+	struct encl_op_get get_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	op.type = ENCL_OP_PUT;
-	op.buffer = MAGIC;
+	put_op.header.type = ENCL_OP_PUT;
+	put_op.value = MAGIC;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0);
 
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	op.type = ENCL_OP_GET;
-	op.buffer = 0;
+	get_op.header.type = ENCL_OP_GET;
+	get_op.value = 0;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0);
 
-	EXPECT_EQ(op.buffer, MAGIC);
+	EXPECT_EQ(get_op.value, MAGIC);
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 }
@@ -292,7 +293,8 @@ static unsigned long get_total_epc_mem(void)
 TEST_F(enclave, unclobbered_vdso_oversubscribed)
 {
 	unsigned long total_mem;
-	struct encl_op op;
+	struct encl_op_put put_op;
+	struct encl_op_get get_op;
 
 	total_mem = get_total_epc_mem();
 	ASSERT_NE(total_mem, 0);
@@ -301,20 +303,20 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	op.type = ENCL_OP_PUT;
-	op.buffer = MAGIC;
+	put_op.header.type = ENCL_OP_PUT;
+	put_op.value = MAGIC;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0);
 
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	op.type = ENCL_OP_GET;
-	op.buffer = 0;
+	get_op.header.type = ENCL_OP_GET;
+	get_op.value = 0;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, false), 0);
+	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0);
 
-	EXPECT_EQ(op.buffer, MAGIC);
+	EXPECT_EQ(get_op.value, MAGIC);
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
@@ -322,27 +324,28 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 
 TEST_F(enclave, clobbered_vdso)
 {
-	struct encl_op op;
+	struct encl_op_put put_op;
+	struct encl_op_get get_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	op.type = ENCL_OP_PUT;
-	op.buffer = MAGIC;
+	put_op.header.type = ENCL_OP_PUT;
+	put_op.value = MAGIC;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0);
 
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	op.type = ENCL_OP_GET;
-	op.buffer = 0;
+	get_op.header.type = ENCL_OP_GET;
+	get_op.value = 0;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0);
 
-	EXPECT_EQ(op.buffer, MAGIC);
+	EXPECT_EQ(get_op.value, MAGIC);
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 }
@@ -357,7 +360,8 @@ static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r
 
 TEST_F(enclave, clobbered_vdso_and_user_function)
 {
-	struct encl_op op;
+	struct encl_op_put put_op;
+	struct encl_op_get get_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
@@ -367,20 +371,20 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 	self->run.user_handler = (__u64)test_handler;
 	self->run.user_data = 0xdeadbeef;
 
-	op.type = ENCL_OP_PUT;
-	op.buffer = MAGIC;
+	put_op.header.type = ENCL_OP_PUT;
+	put_op.value = MAGIC;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0);
 
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	op.type = ENCL_OP_GET;
-	op.buffer = 0;
+	get_op.header.type = ENCL_OP_GET;
+	get_op.value = 0;
 
-	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0);
 
-	EXPECT_EQ(op.buffer, MAGIC);
+	EXPECT_EQ(get_op.value, MAGIC);
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 }
diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c
index 734ea52f9924..f11eb8315704 100644
--- a/tools/testing/selftests/sgx/test_encl.c
+++ b/tools/testing/selftests/sgx/test_encl.c
@@ -16,20 +16,29 @@ static void *memcpy(void *dest, const void *src, size_t n)
 	return dest;
 }
 
-void encl_body(void *rdi,  void *rsi)
+static void do_encl_op_put(void *op)
+{
+	struct encl_op_put *op2 = op;
+
+	memcpy(&encl_buffer[0], &op2->value, 8);
+}
+
+static void do_encl_op_get(void *op)
 {
-	struct encl_op *op = (struct encl_op *)rdi;
+	struct encl_op_get *op2 = op;
 
-	switch (op->type) {
-	case ENCL_OP_PUT:
-		memcpy(&encl_buffer[0], &op->buffer, 8);
-		break;
+	memcpy(&op2->value, &encl_buffer[0], 8);
+}
+
+void encl_body(void *rdi,  void *rsi)
+{
+	const void (*encl_op_array[ENCL_OP_MAX])(void *) = {
+		do_encl_op_put,
+		do_encl_op_get,
+	};
 
-	case ENCL_OP_GET:
-		memcpy(&op->buffer, &encl_buffer[0], 8);
-		break;
+	struct encl_op_header *op = (struct encl_op_header *)rdi;
 
-	default:
-		break;
-	}
+	if (op->type < ENCL_OP_MAX)
+		(*encl_op_array[op->type])(op);
 }
-- 
cgit v1.2.3


From c085dfc7685c8c36698b851b03990b75a3226e97 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 10:35:23 -0800
Subject: selftests/sgx: Rename test properties in preparation for more enclave
 tests

SGX selftests prepares a data structure outside of the enclave with
the type of and data for the operation that needs to be run within
the enclave. At this time only two complementary operations are supported
by the enclave: copying a value from outside the enclave into a default
buffer within the enclave and reading a value from the enclave's default
buffer into a variable accessible outside the enclave.

In preparation for more operations supported by the enclave the names of the
current enclave operations are changed to more accurately reflect the
operations and more easily distinguish it from future operations:

* The enums ENCL_OP_PUT and ENCL_OP_GET are renamed to ENCL_OP_PUT_TO_BUFFER
  and ENCL_OP_GET_FROM_BUFFER respectively.
* The structs encl_op_put and encl_op_get are renamed to encl_op_put_to_buf
  and encl_op_get_from_buf respectively.
* The enclave functions do_encl_op_put and do_encl_op_get are renamed to
  do_encl_op_put_to_buf and do_encl_op_get_from_buf respectively.

No functional changes.

Suggested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/023fda047c787cf330b88ed9337705edae6a0078.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/defines.h   |  8 ++++----
 tools/testing/selftests/sgx/main.c      | 32 ++++++++++++++++----------------
 tools/testing/selftests/sgx/test_encl.c | 12 ++++++------
 3 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h
index 6ff95a766287..9ea0c7882dfb 100644
--- a/tools/testing/selftests/sgx/defines.h
+++ b/tools/testing/selftests/sgx/defines.h
@@ -19,8 +19,8 @@
 #include "../../../../arch/x86/include/uapi/asm/sgx.h"
 
 enum encl_op_type {
-	ENCL_OP_PUT,
-	ENCL_OP_GET,
+	ENCL_OP_PUT_TO_BUFFER,
+	ENCL_OP_GET_FROM_BUFFER,
 	ENCL_OP_MAX,
 };
 
@@ -28,12 +28,12 @@ struct encl_op_header {
 	uint64_t type;
 };
 
-struct encl_op_put {
+struct encl_op_put_to_buf {
 	struct encl_op_header header;
 	uint64_t value;
 };
 
-struct encl_op_get {
+struct encl_op_get_from_buf {
 	struct encl_op_header header;
 	uint64_t value;
 };
diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 3996c00486b6..61321229e485 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -220,15 +220,15 @@ FIXTURE_TEARDOWN(enclave)
 
 TEST_F(enclave, unclobbered_vdso)
 {
-	struct encl_op_put put_op;
-	struct encl_op_get get_op;
+	struct encl_op_get_from_buf get_op;
+	struct encl_op_put_to_buf put_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	put_op.header.type = ENCL_OP_PUT;
+	put_op.header.type = ENCL_OP_PUT_TO_BUFFER;
 	put_op.value = MAGIC;
 
 	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0);
@@ -236,7 +236,7 @@ TEST_F(enclave, unclobbered_vdso)
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	get_op.header.type = ENCL_OP_GET;
+	get_op.header.type = ENCL_OP_GET_FROM_BUFFER;
 	get_op.value = 0;
 
 	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0);
@@ -292,9 +292,9 @@ static unsigned long get_total_epc_mem(void)
 
 TEST_F(enclave, unclobbered_vdso_oversubscribed)
 {
+	struct encl_op_get_from_buf get_op;
+	struct encl_op_put_to_buf put_op;
 	unsigned long total_mem;
-	struct encl_op_put put_op;
-	struct encl_op_get get_op;
 
 	total_mem = get_total_epc_mem();
 	ASSERT_NE(total_mem, 0);
@@ -303,7 +303,7 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	put_op.header.type = ENCL_OP_PUT;
+	put_op.header.type = ENCL_OP_PUT_TO_BUFFER;
 	put_op.value = MAGIC;
 
 	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, false), 0);
@@ -311,7 +311,7 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	get_op.header.type = ENCL_OP_GET;
+	get_op.header.type = ENCL_OP_GET_FROM_BUFFER;
 	get_op.value = 0;
 
 	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, false), 0);
@@ -324,15 +324,15 @@ TEST_F(enclave, unclobbered_vdso_oversubscribed)
 
 TEST_F(enclave, clobbered_vdso)
 {
-	struct encl_op_put put_op;
-	struct encl_op_get get_op;
+	struct encl_op_get_from_buf get_op;
+	struct encl_op_put_to_buf put_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
 	memset(&self->run, 0, sizeof(self->run));
 	self->run.tcs = self->encl.encl_base;
 
-	put_op.header.type = ENCL_OP_PUT;
+	put_op.header.type = ENCL_OP_PUT_TO_BUFFER;
 	put_op.value = MAGIC;
 
 	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0);
@@ -340,7 +340,7 @@ TEST_F(enclave, clobbered_vdso)
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	get_op.header.type = ENCL_OP_GET;
+	get_op.header.type = ENCL_OP_GET_FROM_BUFFER;
 	get_op.value = 0;
 
 	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0);
@@ -360,8 +360,8 @@ static int test_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r
 
 TEST_F(enclave, clobbered_vdso_and_user_function)
 {
-	struct encl_op_put put_op;
-	struct encl_op_get get_op;
+	struct encl_op_get_from_buf get_op;
+	struct encl_op_put_to_buf put_op;
 
 	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
 
@@ -371,7 +371,7 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 	self->run.user_handler = (__u64)test_handler;
 	self->run.user_data = 0xdeadbeef;
 
-	put_op.header.type = ENCL_OP_PUT;
+	put_op.header.type = ENCL_OP_PUT_TO_BUFFER;
 	put_op.value = MAGIC;
 
 	EXPECT_EQ(ENCL_CALL(&put_op, &self->run, true), 0);
@@ -379,7 +379,7 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 	EXPECT_EEXIT(&self->run);
 	EXPECT_EQ(self->run.user_data, 0);
 
-	get_op.header.type = ENCL_OP_GET;
+	get_op.header.type = ENCL_OP_GET_FROM_BUFFER;
 	get_op.value = 0;
 
 	EXPECT_EQ(ENCL_CALL(&get_op, &self->run, true), 0);
diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c
index f11eb8315704..4e8da738173f 100644
--- a/tools/testing/selftests/sgx/test_encl.c
+++ b/tools/testing/selftests/sgx/test_encl.c
@@ -16,16 +16,16 @@ static void *memcpy(void *dest, const void *src, size_t n)
 	return dest;
 }
 
-static void do_encl_op_put(void *op)
+static void do_encl_op_put_to_buf(void *op)
 {
-	struct encl_op_put *op2 = op;
+	struct encl_op_put_to_buf *op2 = op;
 
 	memcpy(&encl_buffer[0], &op2->value, 8);
 }
 
-static void do_encl_op_get(void *op)
+static void do_encl_op_get_from_buf(void *op)
 {
-	struct encl_op_get *op2 = op;
+	struct encl_op_get_from_buf *op2 = op;
 
 	memcpy(&op2->value, &encl_buffer[0], 8);
 }
@@ -33,8 +33,8 @@ static void do_encl_op_get(void *op)
 void encl_body(void *rdi,  void *rsi)
 {
 	const void (*encl_op_array[ENCL_OP_MAX])(void *) = {
-		do_encl_op_put,
-		do_encl_op_get,
+		do_encl_op_put_to_buf,
+		do_encl_op_get_from_buf,
 	};
 
 	struct encl_op_header *op = (struct encl_op_header *)rdi;
-- 
cgit v1.2.3


From abc5cec4735080d12d644c2d39f96cf98c0a367c Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 10:35:24 -0800
Subject: selftests/sgx: Add page permission and exception test

The Enclave Page Cache Map (EPCM) is a secure structure used by the
processor to track the contents of the enclave page cache. The EPCM
contains permissions with which enclave pages can be accessed. SGX
support allows EPCM and PTE page permissions to differ - as long as
the PTE permissions do not exceed the EPCM permissions.

Add a test that:
(1) Creates an SGX enclave page with writable EPCM permission.
(2) Changes the PTE permission on the page to read-only. This should
    be permitted because the permission does not exceed the EPCM
    permission.
(3) Attempts a write to the page. This should generate a page fault
    (#PF) because of the read-only PTE even though the EPCM
    permissions allow the page to be written to.

This introduces the first test of SGX exception handling. In this test
the issue that caused the exception (PTE page permissions) can be fixed
from outside the enclave and after doing so it is possible to re-enter
enclave at original entrypoint with ERESUME.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/3bcc73a4b9fe8780bdb40571805e7ced59e01df7.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/defines.h   |  14 ++++
 tools/testing/selftests/sgx/main.c      | 134 ++++++++++++++++++++++++++++++++
 tools/testing/selftests/sgx/test_encl.c |  21 +++++
 3 files changed, 169 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h
index 9ea0c7882dfb..0bbda6f0c7d3 100644
--- a/tools/testing/selftests/sgx/defines.h
+++ b/tools/testing/selftests/sgx/defines.h
@@ -21,6 +21,8 @@
 enum encl_op_type {
 	ENCL_OP_PUT_TO_BUFFER,
 	ENCL_OP_GET_FROM_BUFFER,
+	ENCL_OP_PUT_TO_ADDRESS,
+	ENCL_OP_GET_FROM_ADDRESS,
 	ENCL_OP_MAX,
 };
 
@@ -38,4 +40,16 @@ struct encl_op_get_from_buf {
 	uint64_t value;
 };
 
+struct encl_op_put_to_addr {
+	struct encl_op_header header;
+	uint64_t value;
+	uint64_t addr;
+};
+
+struct encl_op_get_from_addr {
+	struct encl_op_header header;
+	uint64_t value;
+	uint64_t addr;
+};
+
 #endif /* DEFINES_H */
diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 61321229e485..1b4858f8ca4e 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -21,6 +21,7 @@
 #include "main.h"
 
 static const uint64_t MAGIC = 0x1122334455667788ULL;
+static const uint64_t MAGIC2 = 0x8877665544332211ULL;
 vdso_sgx_enter_enclave_t vdso_sgx_enter_enclave;
 
 struct vdso_symtab {
@@ -107,6 +108,25 @@ static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name)
 	return NULL;
 }
 
+/*
+ * Return the offset in the enclave where the data segment can be found.
+ * The first RW segment loaded is the TCS, skip that to get info on the
+ * data segment.
+ */
+static off_t encl_get_data_offset(struct encl *encl)
+{
+	int i;
+
+	for (i = 1; i < encl->nr_segments; i++) {
+		struct encl_segment *seg = &encl->segment_tbl[i];
+
+		if (seg->prot == (PROT_READ | PROT_WRITE))
+			return seg->offset;
+	}
+
+	return -1;
+}
+
 FIXTURE(enclave) {
 	struct encl encl;
 	struct sgx_enclave_run run;
@@ -389,4 +409,118 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 	EXPECT_EQ(self->run.user_data, 0);
 }
 
+/*
+ * Second page of .data segment is used to test changing PTE permissions.
+ * This spans the local encl_buffer within the test enclave.
+ *
+ * 1) Start with a sanity check: a value is written to the target page within
+ *    the enclave and read back to ensure target page can be written to.
+ * 2) Change PTE permissions (RW -> RO) of target page within enclave.
+ * 3) Repeat (1) - this time expecting a regular #PF communicated via the
+ *    vDSO.
+ * 4) Change PTE permissions of target page within enclave back to be RW.
+ * 5) Repeat (1) by resuming enclave, now expected to be possible to write to
+ *    and read from target page within enclave.
+ */
+TEST_F(enclave, pte_permissions)
+{
+	struct encl_op_get_from_addr get_addr_op;
+	struct encl_op_put_to_addr put_addr_op;
+	unsigned long data_start;
+	int ret;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	data_start = self->encl.encl_base +
+		     encl_get_data_offset(&self->encl) +
+		     PAGE_SIZE;
+
+	/*
+	 * Sanity check to ensure it is possible to write to page that will
+	 * have its permissions manipulated.
+	 */
+
+	/* Write MAGIC to page */
+	put_addr_op.value = MAGIC;
+	put_addr_op.addr = data_start;
+	put_addr_op.header.type = ENCL_OP_PUT_TO_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/*
+	 * Read memory that was just written to, confirming that it is the
+	 * value previously written (MAGIC).
+	 */
+	get_addr_op.value = 0;
+	get_addr_op.addr = data_start;
+	get_addr_op.header.type = ENCL_OP_GET_FROM_ADDRESS;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Change PTE permissions of target page within the enclave */
+	ret = mprotect((void *)data_start, PAGE_SIZE, PROT_READ);
+	if (ret)
+		perror("mprotect");
+
+	/*
+	 * PTE permissions of target page changed to read-only, EPCM
+	 * permissions unchanged (EPCM permissions are RW), attempt to
+	 * write to the page, expecting a regular #PF.
+	 */
+
+	put_addr_op.value = MAGIC2;
+
+	EXPECT_EQ(ENCL_CALL(&put_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(self->run.exception_vector, 14);
+	EXPECT_EQ(self->run.exception_error_code, 0x7);
+	EXPECT_EQ(self->run.exception_addr, data_start);
+
+	self->run.exception_vector = 0;
+	self->run.exception_error_code = 0;
+	self->run.exception_addr = 0;
+
+	/*
+	 * Change PTE permissions back to enable enclave to write to the
+	 * target page and resume enclave - do not expect any exceptions this
+	 * time.
+	 */
+	ret = mprotect((void *)data_start, PAGE_SIZE, PROT_READ | PROT_WRITE);
+	if (ret)
+		perror("mprotect");
+
+	EXPECT_EQ(vdso_sgx_enter_enclave((unsigned long)&put_addr_op, 0,
+					 0, ERESUME, 0, 0, &self->run),
+		 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	get_addr_op.value = 0;
+
+	EXPECT_EQ(ENCL_CALL(&get_addr_op, &self->run, true), 0);
+
+	EXPECT_EQ(get_addr_op.value, MAGIC2);
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c
index 4e8da738173f..5d86e3e6456a 100644
--- a/tools/testing/selftests/sgx/test_encl.c
+++ b/tools/testing/selftests/sgx/test_encl.c
@@ -4,6 +4,11 @@
 #include <stddef.h>
 #include "defines.h"
 
+/*
+ * Data buffer spanning two pages that will be placed first in .data
+ * segment. Even if not used internally the second page is needed by
+ * external test manipulating page permissions.
+ */
 static uint8_t encl_buffer[8192] = { 1 };
 
 static void *memcpy(void *dest, const void *src, size_t n)
@@ -30,11 +35,27 @@ static void do_encl_op_get_from_buf(void *op)
 	memcpy(&op2->value, &encl_buffer[0], 8);
 }
 
+static void do_encl_op_put_to_addr(void *_op)
+{
+	struct encl_op_put_to_addr *op = _op;
+
+	memcpy((void *)op->addr, &op->value, 8);
+}
+
+static void do_encl_op_get_from_addr(void *_op)
+{
+	struct encl_op_get_from_addr *op = _op;
+
+	memcpy(&op->value, (void *)op->addr, 8);
+}
+
 void encl_body(void *rdi,  void *rsi)
 {
 	const void (*encl_op_array[ENCL_OP_MAX])(void *) = {
 		do_encl_op_put_to_buf,
 		do_encl_op_get_from_buf,
+		do_encl_op_put_to_addr,
+		do_encl_op_get_from_addr,
 	};
 
 	struct encl_op_header *op = (struct encl_op_header *)rdi;
-- 
cgit v1.2.3


From 26e688f1263a3c226f3bb5e3441c310ae11e8001 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 10:35:25 -0800
Subject: selftests/sgx: Enable multiple thread support

Each thread executing in an enclave is associated with a Thread Control
Structure (TCS). The test enclave contains two hardcoded TCS. Each TCS
contains meta-data used by the hardware to save and restore thread specific
information when entering/exiting the enclave.

The two TCS structures within the test enclave share their SSA (State Save
Area) resulting in the threads clobbering each other's data. Fix this by
providing each TCS their own SSA area.

Additionally, there is an 8K stack space and its address is
computed from the enclave entry point which is correctly done for
TCS #1 that starts on the first address inside the enclave but
results in out of bounds memory when entering as TCS #2. Split 8K
stack space into two separate pages with offset symbol between to ensure
the current enclave entry calculation can continue to be used for both
threads.

While using the enclave with multiple threads requires these fixes the
impact is not apparent because every test up to this point enters the
enclave from the first TCS.

More detail about the stack fix:
-------------------------------
Before this change the test enclave (test_encl) looks as follows:

.tcs (2 pages):
(page 1) TCS #1
(page 2) TCS #2

.text (1 page)
One page of code

.data (5 pages)
(page 1) encl_buffer
(page 2) encl_buffer
(page 3) SSA
(page 4 and 5) STACK
encl_stack:

As shown above there is a symbol, encl_stack, that points to the end of the
.data segment (pointing to the end of page 5 in .data) which is also the
end of the enclave.

The enclave entry code computes the stack address by adding encl_stack to
the pointer to the TCS that entered the enclave. When entering at TCS #1
the stack is computed correctly but when entering at TCS #2 the stack
pointer would point to one page beyond the end of the enclave and a #PF
would result when TCS #2 attempts to enter the enclave.

The fix involves moving the encl_stack symbol between the two stack pages.
Doing so enables the stack address computation in the entry code to compute
the correct stack address for each TCS.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/a49dc0d85401db788a0a3f0d795e848abf3b1f44.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/test_encl_bootstrap.S | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S
index 5d5680d4ea39..82fb0dfcbd23 100644
--- a/tools/testing/selftests/sgx/test_encl_bootstrap.S
+++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S
@@ -12,7 +12,7 @@
 
 	.fill	1, 8, 0			# STATE (set by CPU)
 	.fill	1, 8, 0			# FLAGS
-	.quad	encl_ssa		# OSSA
+	.quad	encl_ssa_tcs1		# OSSA
 	.fill	1, 4, 0			# CSSA (set by CPU)
 	.fill	1, 4, 1			# NSSA
 	.quad	encl_entry		# OENTRY
@@ -23,10 +23,10 @@
 	.fill	1, 4, 0xFFFFFFFF	# GSLIMIT
 	.fill	4024, 1, 0		# Reserved
 
-	# Identical to the previous TCS.
+	# TCS2
 	.fill	1, 8, 0			# STATE (set by CPU)
 	.fill	1, 8, 0			# FLAGS
-	.quad	encl_ssa		# OSSA
+	.quad	encl_ssa_tcs2		# OSSA
 	.fill	1, 4, 0			# CSSA (set by CPU)
 	.fill	1, 4, 1			# NSSA
 	.quad	encl_entry		# OENTRY
@@ -40,8 +40,9 @@
 	.text
 
 encl_entry:
-	# RBX contains the base address for TCS, which is also the first address
-	# inside the enclave. By adding the value of le_stack_end to it, we get
+	# RBX contains the base address for TCS, which is the first address
+	# inside the enclave for TCS #1 and one page into the enclave for
+	# TCS #2. By adding the value of encl_stack to it, we get
 	# the absolute address for the stack.
 	lea	(encl_stack)(%rbx), %rax
 	xchg	%rsp, %rax
@@ -81,9 +82,15 @@ encl_entry:
 
 	.section ".data", "aw"
 
-encl_ssa:
+encl_ssa_tcs1:
+	.space 4096
+encl_ssa_tcs2:
 	.space 4096
 
 	.balign 4096
-	.space 8192
+	# Stack of TCS #1
+	.space 4096
 encl_stack:
+	.balign 4096
+	# Stack of TCS #2
+	.space 4096
-- 
cgit v1.2.3


From 688542e29fae655a8be25832f6a9959bdd308dd8 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Mon, 15 Nov 2021 10:35:26 -0800
Subject: selftests/sgx: Add test for multiple TCS entry

Each thread executing in an enclave is associated with a Thread Control
Structure (TCS). The SGX test enclave contains two hardcoded TCS, thus
supporting two threads in the enclave.

Add a test to ensure it is possible to enter enclave at both entrypoints.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/7be151a57b4c7959a2364753b995e0006efa3da1.1636997631.git.reinette.chatre@intel.com
---
 tools/testing/selftests/sgx/defines.h   |  1 +
 tools/testing/selftests/sgx/main.c      | 32 ++++++++++++++++++++++++++++++++
 tools/testing/selftests/sgx/test_encl.c |  6 ++++++
 3 files changed, 39 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h
index 0bbda6f0c7d3..02d775789ea7 100644
--- a/tools/testing/selftests/sgx/defines.h
+++ b/tools/testing/selftests/sgx/defines.h
@@ -23,6 +23,7 @@ enum encl_op_type {
 	ENCL_OP_GET_FROM_BUFFER,
 	ENCL_OP_PUT_TO_ADDRESS,
 	ENCL_OP_GET_FROM_ADDRESS,
+	ENCL_OP_NOP,
 	ENCL_OP_MAX,
 };
 
diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 1b4858f8ca4e..7e912db4c6c5 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -409,6 +409,38 @@ TEST_F(enclave, clobbered_vdso_and_user_function)
 	EXPECT_EQ(self->run.user_data, 0);
 }
 
+/*
+ * Sanity check that it is possible to enter either of the two hardcoded TCS
+ */
+TEST_F(enclave, tcs_entry)
+{
+	struct encl_op_header op;
+
+	ASSERT_TRUE(setup_test_encl(ENCL_HEAP_SIZE_DEFAULT, &self->encl, _metadata));
+
+	memset(&self->run, 0, sizeof(self->run));
+	self->run.tcs = self->encl.encl_base;
+
+	op.type = ENCL_OP_NOP;
+
+	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+
+	/* Move to the next TCS. */
+	self->run.tcs = self->encl.encl_base + PAGE_SIZE;
+
+	EXPECT_EQ(ENCL_CALL(&op, &self->run, true), 0);
+
+	EXPECT_EEXIT(&self->run);
+	EXPECT_EQ(self->run.exception_vector, 0);
+	EXPECT_EQ(self->run.exception_error_code, 0);
+	EXPECT_EQ(self->run.exception_addr, 0);
+}
+
 /*
  * Second page of .data segment is used to test changing PTE permissions.
  * This spans the local encl_buffer within the test enclave.
diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c
index 5d86e3e6456a..4fca01cfd898 100644
--- a/tools/testing/selftests/sgx/test_encl.c
+++ b/tools/testing/selftests/sgx/test_encl.c
@@ -49,6 +49,11 @@ static void do_encl_op_get_from_addr(void *_op)
 	memcpy(&op->value, (void *)op->addr, 8);
 }
 
+static void do_encl_op_nop(void *_op)
+{
+
+}
+
 void encl_body(void *rdi,  void *rsi)
 {
 	const void (*encl_op_array[ENCL_OP_MAX])(void *) = {
@@ -56,6 +61,7 @@ void encl_body(void *rdi,  void *rsi)
 		do_encl_op_get_from_buf,
 		do_encl_op_put_to_addr,
 		do_encl_op_get_from_addr,
+		do_encl_op_nop,
 	};
 
 	struct encl_op_header *op = (struct encl_op_header *)rdi;
-- 
cgit v1.2.3


From e47d0bf800e8d7f4de501987b2788c7f2ce22cd1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Nov 2021 16:04:48 -0800
Subject: bpftool: Add current libbpf_strict mode to version output

+ bpftool --legacy --version
bpftool v5.15.0
features: libbfd, skeletons
+ bpftool --version
bpftool v5.15.0
features: libbfd, libbpf_strict, skeletons

+ bpftool --legacy --help
Usage: bpftool [OPTIONS] OBJECT { COMMAND | help }
       bpftool batch file FILE
       bpftool version

       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }
       OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} |
                    {-V|--version} }
+ bpftool --help
Usage: bpftool [OPTIONS] OBJECT { COMMAND | help }
       bpftool batch file FILE
       bpftool version

       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }
       OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} |
                    {-V|--version} }

+ bpftool --legacy
Usage: bpftool [OPTIONS] OBJECT { COMMAND | help }
       bpftool batch file FILE
       bpftool version

       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }
       OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} |
                    {-V|--version} }
+ bpftool
Usage: bpftool [OPTIONS] OBJECT { COMMAND | help }
       bpftool batch file FILE
       bpftool version

       OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }
       OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} |
                    {-V|--version} }

+ bpftool --legacy version
bpftool v5.15.0
features: libbfd, skeletons
+ bpftool version
bpftool v5.15.0
features: libbfd, libbpf_strict, skeletons

+ bpftool --json --legacy version
{"version":"5.15.0","features":{"libbfd":true,"libbpf_strict":false,"skeletons":true}}
+ bpftool --json version
{"version":"5.15.0","features":{"libbfd":true,"libbpf_strict":true,"skeletons":true}}

Suggested-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211116000448.2918854-1-sdf@google.com
---
 tools/bpf/bpftool/main.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 473791e87f7d..8b71500e7cb2 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -93,6 +93,7 @@ static int do_version(int argc, char **argv)
 		jsonw_name(json_wtr, "features");
 		jsonw_start_object(json_wtr);	/* features */
 		jsonw_bool_field(json_wtr, "libbfd", has_libbfd);
+		jsonw_bool_field(json_wtr, "libbpf_strict", !legacy_libbpf);
 		jsonw_bool_field(json_wtr, "skeletons", has_skeletons);
 		jsonw_end_object(json_wtr);	/* features */
 
@@ -106,6 +107,10 @@ static int do_version(int argc, char **argv)
 			printf(" libbfd");
 			nb_features++;
 		}
+		if (!legacy_libbpf) {
+			printf("%s libbpf_strict", nb_features++ ? "," : "");
+			nb_features++;
+		}
 		if (has_skeletons)
 			printf("%s skeletons", nb_features++ ? "," : "");
 		printf("\n");
@@ -400,6 +405,7 @@ int main(int argc, char **argv)
 		{ "legacy",	no_argument,	NULL,	'l' },
 		{ 0 }
 	};
+	bool version_requested = false;
 	int opt, ret;
 
 	last_do_help = do_help;
@@ -414,7 +420,8 @@ int main(int argc, char **argv)
 				  options, NULL)) >= 0) {
 		switch (opt) {
 		case 'V':
-			return do_version(argc, argv);
+			version_requested = true;
+			break;
 		case 'h':
 			return do_help(argc, argv);
 		case 'p':
@@ -479,6 +486,9 @@ int main(int argc, char **argv)
 	if (argc < 0)
 		usage();
 
+	if (version_requested)
+		return do_version(argc, argv);
+
 	ret = cmd_select(cmds, argc, argv, do_help);
 
 	if (json_output)
-- 
cgit v1.2.3


From 69a055d546156adc6f7727ec981f721d5ba9231a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 15 Nov 2021 08:39:37 -0800
Subject: libbpf: Fix a couple of missed btf_type_tag handling in btf.c

Commit 2dc1e488e5cd ("libbpf: Support BTF_KIND_TYPE_TAG") added the
BTF_KIND_TYPE_TAG support. But to test vmlinux build with ...

  #define __user __attribute__((btf_type_tag("user")))

... I needed to sync libbpf repo and manually copy libbpf sources to
pahole. To simplify process, I used BTF_KIND_RESTRICT to simulate the
BTF_KIND_TYPE_TAG with vmlinux build as "restrict" modifier is barely
used in kernel.

But this approach missed one case in dedup with structures where
BTF_KIND_RESTRICT is handled and BTF_KIND_TYPE_TAG is not handled in
btf_dedup_is_equiv(), and this will result in a pahole dedup failure.
This patch fixed this issue and a selftest is added in the subsequent
patch to test this scenario.

The other missed handling is in btf__resolve_size(). Currently the compiler
always emit like PTR->TYPE_TAG->... so in practice we don't hit the missing
BTF_KIND_TYPE_TAG handling issue with compiler generated code. But lets
add case BTF_KIND_TYPE_TAG in the switch statement to be future proof.

Fixes: 2dc1e488e5cd ("libbpf: Support BTF_KIND_TYPE_TAG")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211115163937.3922235-1-yhs@fb.com
---
 tools/lib/bpf/btf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index fadf089ae8fe..b6be579e0dc6 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -610,6 +610,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 		case BTF_KIND_RESTRICT:
 		case BTF_KIND_VAR:
 		case BTF_KIND_DECL_TAG:
+		case BTF_KIND_TYPE_TAG:
 			type_id = t->type;
 			break;
 		case BTF_KIND_ARRAY:
@@ -4023,6 +4024,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 	case BTF_KIND_PTR:
 	case BTF_KIND_TYPEDEF:
 	case BTF_KIND_FUNC:
+	case BTF_KIND_TYPE_TAG:
 		if (cand_type->info != canon_type->info)
 			return 0;
 		return btf_dedup_is_equiv(d, cand_type->type, canon_type->type);
-- 
cgit v1.2.3


From 4746158305e98c91c479539d53ef9bf8c520dd66 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 15 Nov 2021 08:39:43 -0800
Subject: selftests/bpf: Add a dedup selftest with equivalent structure types

Without previous libbpf patch, the following error will occur:

  $ ./test_progs -t btf
  ...
  do_test_dedup:FAIL:check btf_dedup failed errno:-22#13/205 btf/dedup: btf_type_tag #5, struct:FAIL

And the previous libbpf patch fixed the issue.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211115163943.3922547-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 4aa6343dc4c8..f9326a13badb 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -7352,6 +7352,32 @@ static struct btf_dedup_test dedup_tests[] = {
 		BTF_STR_SEC("\0tag1"),
 	},
 },
+{
+	.descr = "dedup: btf_type_tag #5, struct",
+	.input = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),				/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),					/* [2] */
+			BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4),	/* [3] */
+			BTF_MEMBER_ENC(NAME_NTH(3), 2, BTF_MEMBER_OFFSET(0, 0)),
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),					/* [4] */
+			BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4),	/* [5] */
+			BTF_MEMBER_ENC(NAME_NTH(3), 4, BTF_MEMBER_OFFSET(0, 0)),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0t\0m"),
+	},
+	.expect = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),				/* [1] */
+			BTF_TYPE_TAG_ENC(NAME_NTH(1), 1),					/* [2] */
+			BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4),	/* [3] */
+			BTF_MEMBER_ENC(NAME_NTH(3), 2, BTF_MEMBER_OFFSET(0, 0)),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0tag1\0t\0m"),
+	},
+},
 
 };
 
-- 
cgit v1.2.3


From 4344842836e9b9a7b695dc84956cdecd83ac02e9 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Mon, 15 Nov 2021 22:58:42 +0000
Subject: bpftool: Add SPDX tags to RST documentation files

Most files in the kernel repository have a SPDX tags. The files that
don't have such a tag (or another license boilerplate) tend to fall
under the GPL-2.0 license. In the past, bpftool's Makefile (for example)
has been marked as GPL-2.0 for that reason, when in fact all bpftool is
dual-licensed.

To prevent a similar confusion from happening with the RST documentation
files for bpftool, let's explicitly mark all files as dual-licensed.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211115225844.33943-2-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/Makefile               | 2 +-
 tools/bpf/bpftool/Documentation/bpftool-btf.rst        | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst     | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-feature.rst    | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-gen.rst        | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-iter.rst       | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-link.rst       | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-map.rst        | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-net.rst        | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst       | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-prog.rst       | 2 ++
 tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst | 2 ++
 tools/bpf/bpftool/Documentation/bpftool.rst            | 2 ++
 tools/bpf/bpftool/Documentation/common_options.rst     | 2 ++
 14 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index 692e1b947490..ac8487dcff1d 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -24,7 +24,7 @@ man: man8
 man8: $(DOC_MAN8)
 
 RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
-RST2MAN_OPTS += --verbose
+RST2MAN_OPTS += --verbose --strip-comments
 
 list_pages = $(sort $(basename $(filter-out $(1),$(MAN8_RST))))
 see_also = $(subst " ",, \
diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 4425d942dd39..2d2ceb7163f6 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-btf
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 8069d37dd991..b954faeb0f07 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-cgroup
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index ab9f57ee4c3a..b1471788a15f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ===============
 bpftool-feature
 ===============
diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
index 2a137f8a4cea..51e2e8de5208 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-gen
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
index 471f363a725a..51914c9e8a54 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ============
 bpftool-iter
 ============
diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst
index 9434349636a5..31371bcf605a 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-link.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-link
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 991d18fd84f2..e22c918c069c 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-map
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 7ec57535a7c1..6d1aa374529f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-net
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index ce52798a917d..ad554806faa2 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-perf
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index f27265bd589b..d31148571403 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 bpftool-prog
 ================
diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
index 02afc0fc14cb..77b845b5ac61 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ==================
 bpftool-struct_ops
 ==================
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 8ac86565c501..1248b35e67ae 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 ================
 BPFTOOL
 ================
diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst
index 75adf23202d8..908487b9c2ad 100644
--- a/tools/bpf/bpftool/Documentation/common_options.rst
+++ b/tools/bpf/bpftool/Documentation/common_options.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
 -h, --help
 	  Print short help message (similar to **bpftool help**).
 
-- 
cgit v1.2.3


From b623181520404ef48f7421333561bd294c6c6b11 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Mon, 15 Nov 2021 22:58:43 +0000
Subject: bpftool: Update doc (use susbtitutions) and test_bpftool_synctypes.py

test_bpftool_synctypes.py helps detecting inconsistencies in bpftool
between the different list of types and options scattered in the
sources, the documentation, and the bash completion. For options that
apply to all bpftool commands, the script had a hardcoded list of
values, and would use them to check whether the man pages are
up-to-date. When writing the script, it felt acceptable to have this
list in order to avoid to open and parse bpftool's main.h every time,
and because the list of global options in bpftool doesn't change so
often.

However, this is prone to omissions, and we recently added a new
-l|--legacy option which was described in common_options.rst, but not
listed in the options summary of each manual page. The script did not
complain, because it keeps comparing the hardcoded list to the (now)
outdated list in the header file.

To address the issue, this commit brings the following changes:

- Options that are common to all bpftool commands (--json, --pretty, and
  --debug) are moved to a dedicated file, and used in the definition of
  a RST substitution. This substitution is used in the sources of all
  the man pages.

- This list of common options is updated, with the addition of the new
  -l|--legacy option.

- The script test_bpftool_synctypes.py is updated to compare:
    - Options specific to a command, found in C files, for the
      interactive help messages, with the same specific options from the
      relevant man page for that command.
    - Common options, checked just once: the list in main.h is
      compared with the new list in substitutions.rst.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211115225844.33943-3-quentin@isovalent.com
---
 tools/bpf/bpftool/Documentation/bpftool-btf.rst    |  5 +-
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |  5 +-
 .../bpf/bpftool/Documentation/bpftool-feature.rst  |  4 +-
 tools/bpf/bpftool/Documentation/bpftool-gen.rst    |  5 +-
 tools/bpf/bpftool/Documentation/bpftool-iter.rst   |  4 +-
 tools/bpf/bpftool/Documentation/bpftool-link.rst   |  5 +-
 tools/bpf/bpftool/Documentation/bpftool-map.rst    |  5 +-
 tools/bpf/bpftool/Documentation/bpftool-net.rst    |  4 +-
 tools/bpf/bpftool/Documentation/bpftool-perf.rst   |  4 +-
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   |  4 +-
 .../bpftool/Documentation/bpftool-struct_ops.rst   |  4 +-
 tools/bpf/bpftool/Documentation/bpftool.rst        |  5 +-
 tools/bpf/bpftool/Documentation/substitutions.rst  |  3 +
 .../selftests/bpf/test_bpftool_synctypes.py        | 70 +++++++++++++++++++---
 14 files changed, 102 insertions(+), 25 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/substitutions.rst

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 2d2ceb7163f6..342716f74ec4 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -9,13 +9,14 @@ tool for inspection of BTF data
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **btf** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } |
-	{ **-B** | **--base-btf** } }
+	*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } }
 
 	*COMMANDS* := { **dump** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index b954faeb0f07..a17e9aa314fd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF progs
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **cgroup** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-	{ **-f** | **--bpffs** } }
+	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **tree** | **attach** | **detach** | **help** }
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index b1471788a15f..4ce9a77bc1e0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -9,12 +9,14 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **feature** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { |COMMON_OPTIONS| }
 
 	*COMMANDS* := { **probe** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
index 51e2e8de5208..bc276388f432 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
@@ -9,13 +9,14 @@ tool for BPF code-generation
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **gen** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-	{ **-L** | **--use-loader** } }
+	*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } }
 
 	*COMMAND* := { **object** | **skeleton** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
index 51914c9e8a54..84839d488621 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
@@ -9,12 +9,14 @@ tool to create BPF iterators
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **iter** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { |COMMON_OPTIONS| }
 
 	*COMMANDS* := { **pin** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst
index 31371bcf605a..52a4eee4af54 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-link.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst
@@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF links
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **link** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-	{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
 	*COMMANDS* := { **show** | **list** | **pin** | **help** }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index e22c918c069c..7c188a598444 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF maps
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **map** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
-	{ **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 6d1aa374529f..f4e0a516335a 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -9,12 +9,14 @@ tool for inspection of netdev/tc related bpf prog attachments
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **net** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { |COMMON_OPTIONS| }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **attach** | **detach** | **help** }
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index ad554806faa2..5fea633a82f1 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -9,12 +9,14 @@ tool for inspection of perf related bpf prog attachments
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **perf** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { |COMMON_OPTIONS| }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **help** }
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index d31148571403..a2e9359e554c 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -9,12 +9,14 @@ tool for inspection and simple manipulation of eBPF progs
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **prog** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } |
+	*OPTIONS* := { |COMMON_OPTIONS| |
 	{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
 	{ **-L** | **--use-loader** } }
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
index 77b845b5ac61..ee53a122c0c7 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
@@ -9,12 +9,14 @@ tool to register/unregister/introspect BPF struct_ops
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
 	**bpftool** [*OPTIONS*] **struct_ops** *COMMAND*
 
-	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { |COMMON_OPTIONS| }
 
 	*COMMANDS* :=
 	{ **show** | **list** | **dump** | **register** | **unregister** | **help** }
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 1248b35e67ae..7084dd9fa2f8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -9,6 +9,8 @@ tool for inspection and simple manipulation of eBPF programs and maps
 
 :Manual section: 8
 
+.. include:: substitutions.rst
+
 SYNOPSIS
 ========
 
@@ -20,8 +22,7 @@ SYNOPSIS
 
 	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** }
 
-	*OPTIONS* := { { **-V** | **--version** } |
-	{ **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } }
+	*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| }
 
 	*MAP-COMMANDS* :=
 	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
diff --git a/tools/bpf/bpftool/Documentation/substitutions.rst b/tools/bpf/bpftool/Documentation/substitutions.rst
new file mode 100644
index 000000000000..ccf1ffa0686c
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/substitutions.rst
@@ -0,0 +1,3 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+.. |COMMON_OPTIONS| replace:: { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | { **-l** | **--legacy** }
diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
index be54b7335a76..3f6e562565ec 100755
--- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py
+++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
@@ -242,12 +242,6 @@ class FileExtractor(object):
         end_marker = re.compile('}\\\\n')
         return self.__get_description_list(start_marker, pattern, end_marker)
 
-    def default_options(self):
-        """
-        Return the default options contained in HELP_SPEC_OPTIONS
-        """
-        return { '-j', '--json', '-p', '--pretty', '-d', '--debug' }
-
     def get_bashcomp_list(self, block_name):
         """
         Search for and parse a list of type names from a variable in bash
@@ -274,7 +268,56 @@ class SourceFileExtractor(FileExtractor):
     defined in children classes.
     """
     def get_options(self):
-        return self.default_options().union(self.get_help_list_macro('HELP_SPEC_OPTIONS'))
+        return self.get_help_list_macro('HELP_SPEC_OPTIONS')
+
+class MainHeaderFileExtractor(SourceFileExtractor):
+    """
+    An extractor for bpftool's main.h
+    """
+    filename = os.path.join(BPFTOOL_DIR, 'main.h')
+
+    def get_common_options(self):
+        """
+        Parse the list of common options in main.h (options that apply to all
+        commands), which looks to the lists of options in other source files
+        but has different start and end markers:
+
+            "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy}"
+
+        Return a set containing all options, such as:
+
+            {'-p', '-d', '--legacy', '--pretty', '--debug', '--json', '-l', '-j'}
+        """
+        start_marker = re.compile(f'"OPTIONS :=')
+        pattern = re.compile('([\w-]+) ?(?:\||}[ }\]"])')
+        end_marker = re.compile('#define')
+
+        parser = InlineListParser(self.reader)
+        parser.search_block(start_marker)
+        return parser.parse(pattern, end_marker)
+
+class ManSubstitutionsExtractor(SourceFileExtractor):
+    """
+    An extractor for substitutions.rst
+    """
+    filename = os.path.join(BPFTOOL_DIR, 'Documentation/substitutions.rst')
+
+    def get_common_options(self):
+        """
+        Parse the list of common options in substitutions.rst (options that
+        apply to all commands).
+
+        Return a set containing all options, such as:
+
+            {'-p', '-d', '--legacy', '--pretty', '--debug', '--json', '-l', '-j'}
+        """
+        start_marker = re.compile('\|COMMON_OPTIONS\| replace:: {')
+        pattern = re.compile('\*\*([\w/-]+)\*\*')
+        end_marker = re.compile('}$')
+
+        parser = InlineListParser(self.reader)
+        parser.search_block(start_marker)
+        return parser.parse(pattern, end_marker)
 
 class ProgFileExtractor(SourceFileExtractor):
     """
@@ -580,6 +623,19 @@ def main():
     verify(help_main_options, man_main_options,
             f'Comparing {source_main_info.filename} (do_help() OPTIONS) and {man_main_info.filename} (OPTIONS):')
 
+    # Compare common options (options that apply to all commands)
+
+    main_hdr_info = MainHeaderFileExtractor()
+    source_common_options = main_hdr_info.get_common_options()
+    main_hdr_info.close()
+
+    man_substitutions = ManSubstitutionsExtractor()
+    man_common_options = man_substitutions.get_common_options()
+    man_substitutions.close()
+
+    verify(source_common_options, man_common_options,
+            f'Comparing common options from {main_hdr_info.filename} (HELP_SPEC_OPTIONS) and {man_substitutions.filename}:')
+
     sys.exit(retval)
 
 if __name__ == "__main__":
-- 
cgit v1.2.3


From e12cd158c8a45b3926cc2f42151384a2d7fdcec3 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin@isovalent.com>
Date: Mon, 15 Nov 2021 22:58:44 +0000
Subject: selftests/bpf: Configure dir paths via env in
 test_bpftool_synctypes.py

Script test_bpftool_synctypes.py parses a number of files in the bpftool
directory (or even elsewhere in the repo) to make sure that the list of
types or options in those different files are consistent. Instead of
having fixed paths, let's make the directories configurable through
environment variable. This should make easier in the future to run the
script in a different setup, for example on an out-of-tree bpftool
mirror with a different layout.

Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211115225844.33943-4-quentin@isovalent.com
---
 .../selftests/bpf/test_bpftool_synctypes.py        | 26 ++++++++++++++--------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
index 3f6e562565ec..6bf21e47882a 100755
--- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py
+++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py
@@ -9,7 +9,15 @@ import os, sys
 
 LINUX_ROOT = os.path.abspath(os.path.join(__file__,
     os.pardir, os.pardir, os.pardir, os.pardir, os.pardir))
-BPFTOOL_DIR = os.path.join(LINUX_ROOT, 'tools/bpf/bpftool')
+BPFTOOL_DIR = os.getenv('BPFTOOL_DIR',
+    os.path.join(LINUX_ROOT, 'tools/bpf/bpftool'))
+BPFTOOL_BASHCOMP_DIR = os.getenv('BPFTOOL_BASHCOMP_DIR',
+    os.path.join(BPFTOOL_DIR, 'bash-completion'))
+BPFTOOL_DOC_DIR = os.getenv('BPFTOOL_DOC_DIR',
+    os.path.join(BPFTOOL_DIR, 'Documentation'))
+INCLUDE_DIR = os.getenv('INCLUDE_DIR',
+    os.path.join(LINUX_ROOT, 'tools/include'))
+
 retval = 0
 
 class BlockParser(object):
@@ -300,7 +308,7 @@ class ManSubstitutionsExtractor(SourceFileExtractor):
     """
     An extractor for substitutions.rst
     """
-    filename = os.path.join(BPFTOOL_DIR, 'Documentation/substitutions.rst')
+    filename = os.path.join(BPFTOOL_DOC_DIR, 'substitutions.rst')
 
     def get_common_options(self):
         """
@@ -393,7 +401,7 @@ class BpfHeaderExtractor(FileExtractor):
     """
     An extractor for the UAPI BPF header.
     """
-    filename = os.path.join(LINUX_ROOT, 'tools/include/uapi/linux/bpf.h')
+    filename = os.path.join(INCLUDE_DIR, 'uapi/linux/bpf.h')
 
     def get_prog_types(self):
         return self.get_enum('bpf_prog_type')
@@ -417,7 +425,7 @@ class ManProgExtractor(ManPageExtractor):
     """
     An extractor for bpftool-prog.rst.
     """
-    filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-prog.rst')
+    filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-prog.rst')
 
     def get_attach_types(self):
         return self.get_rst_list('ATTACH_TYPE')
@@ -426,7 +434,7 @@ class ManMapExtractor(ManPageExtractor):
     """
     An extractor for bpftool-map.rst.
     """
-    filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-map.rst')
+    filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-map.rst')
 
     def get_map_types(self):
         return self.get_rst_list('TYPE')
@@ -435,7 +443,7 @@ class ManCgroupExtractor(ManPageExtractor):
     """
     An extractor for bpftool-cgroup.rst.
     """
-    filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-cgroup.rst')
+    filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-cgroup.rst')
 
     def get_attach_types(self):
         return self.get_rst_list('ATTACH_TYPE')
@@ -454,7 +462,7 @@ class BashcompExtractor(FileExtractor):
     """
     An extractor for bpftool's bash completion file.
     """
-    filename = os.path.join(BPFTOOL_DIR, 'bash-completion/bpftool')
+    filename = os.path.join(BPFTOOL_BASHCOMP_DIR, 'bpftool')
 
     def get_prog_attach_types(self):
         return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES')
@@ -605,7 +613,7 @@ def main():
         help_cmd_options = source_info.get_options()
         source_info.close()
 
-        man_cmd_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool-' + cmd + '.rst'))
+        man_cmd_info = ManGenericExtractor(os.path.join(BPFTOOL_DOC_DIR, 'bpftool-' + cmd + '.rst'))
         man_cmd_options = man_cmd_info.get_options()
         man_cmd_info.close()
 
@@ -616,7 +624,7 @@ def main():
     help_main_options = source_main_info.get_options()
     source_main_info.close()
 
-    man_main_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool.rst'))
+    man_main_info = ManGenericExtractor(os.path.join(BPFTOOL_DOC_DIR, 'bpftool.rst'))
     man_main_options = man_main_info.get_options()
     man_main_info.close()
 
-- 
cgit v1.2.3


From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 5 Nov 2021 09:30:00 +0800
Subject: bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the current code, the actual max tail call count is 33 which is greater
than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent
with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance.
We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow
bpf programs to tail-call other bpf programs") and commit f9dabe016b63
("bpf: Undo off-by-one in interpreter tail call count limit"). In order
to avoid changing existing behavior, the actual limit is 33 now, this is
reasonable.

After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can
see there exists failed testcase.

On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set:
 # echo 0 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf
 # dmesg | grep -w FAIL
 Tail call error path, max count reached jited:0 ret 34 != 33 FAIL

On some archs:
 # echo 1 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf
 # dmesg | grep -w FAIL
 Tail call error path, max count reached jited:1 ret 34 != 33 FAIL

Although the above failed testcase has been fixed in commit 18935a72eb25
("bpf/tests: Fix error in tail call limit tests"), it would still be good
to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code
more readable.

The 32-bit x86 JIT was using a limit of 32, just fix the wrong comments and
limit to 33 tail calls as the constant MAX_TAIL_CALL_CNT updated. For the
mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh.
For the riscv JIT, use RV_REG_TCC directly to save one register move as
suggested by Björn Töpel. For the other implementations, no function changes,
it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT
can reflect the actual max tail call count, the related tail call testcases
in test_bpf module and selftests can work well for the interpreter and the
JIT.

Here are the test results on x86_64:

 # uname -m
 x86_64
 # echo 0 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf test_suite=test_tail_calls
 # dmesg | tail -1
 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed]
 # rmmod test_bpf
 # echo 1 > /proc/sys/net/core/bpf_jit_enable
 # modprobe test_bpf test_suite=test_tail_calls
 # dmesg | tail -1
 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed]
 # rmmod test_bpf
 # ./test_progs -t tailcalls
 #142 tailcalls:OK
 Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Björn Töpel <bjorn@kernel.org>
Acked-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn
---
 arch/arm/net/bpf_jit_32.c         |  5 +++--
 arch/arm64/net/bpf_jit_comp.c     |  5 +++--
 arch/mips/net/bpf_jit_comp32.c    |  3 +--
 arch/mips/net/bpf_jit_comp64.c    |  2 +-
 arch/powerpc/net/bpf_jit_comp32.c |  4 ++--
 arch/powerpc/net/bpf_jit_comp64.c |  4 ++--
 arch/riscv/net/bpf_jit_comp32.c   |  6 ++----
 arch/riscv/net/bpf_jit_comp64.c   |  7 +++----
 arch/s390/net/bpf_jit_comp.c      |  6 +++---
 arch/sparc/net/bpf_jit_comp_64.c  |  2 +-
 arch/x86/net/bpf_jit_comp.c       | 10 +++++-----
 arch/x86/net/bpf_jit_comp32.c     |  4 ++--
 include/linux/bpf.h               |  2 +-
 include/uapi/linux/bpf.h          |  2 +-
 kernel/bpf/core.c                 |  3 ++-
 lib/test_bpf.c                    |  4 ++--
 tools/include/uapi/linux/bpf.h    |  2 +-
 17 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'tools')

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index eeb6dc0ecf46..e59b41e9ab0c 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1199,7 +1199,8 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
 
 	/* tmp2[0] = array, tmp2[1] = index */
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	/*
+	 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 * tail_call_cnt++;
 	 */
@@ -1208,7 +1209,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
 	tc = arm_bpf_get_reg64(tcc, tmp, ctx);
 	emit(ARM_CMP_I(tc[0], hi), ctx);
 	_emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx);
-	_emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
+	_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
 	emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx);
 	emit(ARM_ADC_I(tc[0], tc[0], 0), ctx);
 	arm_bpf_put_reg64(tcc, tmp, ctx);
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 86c9dc0681cc..07c12c42b751 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -287,13 +287,14 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
 	emit(A64_CMP(0, r3, tmp), ctx);
 	emit(A64_B_(A64_COND_CS, jmp_offset), ctx);
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	/*
+	 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
 	 *     goto out;
 	 * tail_call_cnt++;
 	 */
 	emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx);
 	emit(A64_CMP(1, tcc, tmp), ctx);
-	emit(A64_B_(A64_COND_HI, jmp_offset), ctx);
+	emit(A64_B_(A64_COND_CS, jmp_offset), ctx);
 	emit(A64_ADD_I(1, tcc, tcc, 1), ctx);
 
 	/* prog = array->ptrs[index];
diff --git a/arch/mips/net/bpf_jit_comp32.c b/arch/mips/net/bpf_jit_comp32.c
index bd996ede12f8..044b11b65bca 100644
--- a/arch/mips/net/bpf_jit_comp32.c
+++ b/arch/mips/net/bpf_jit_comp32.c
@@ -1381,8 +1381,7 @@ void build_prologue(struct jit_context *ctx)
 	 * 16-byte area in the parent's stack frame. On a tail call, the
 	 * calling function jumps into the prologue after these instructions.
 	 */
-	emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO,
-	     min(MAX_TAIL_CALL_CNT + 1, 0xffff));
+	emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff));
 	emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP);
 
 	/*
diff --git a/arch/mips/net/bpf_jit_comp64.c b/arch/mips/net/bpf_jit_comp64.c
index 815ade724227..6475828ffb36 100644
--- a/arch/mips/net/bpf_jit_comp64.c
+++ b/arch/mips/net/bpf_jit_comp64.c
@@ -552,7 +552,7 @@ void build_prologue(struct jit_context *ctx)
 	 * On a tail call, the calling function jumps into the prologue
 	 * after this instruction.
 	 */
-	emit(ctx, addiu, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT + 1, 0xffff));
+	emit(ctx, ori, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff));
 
 	/* === Entry-point for tail calls === */
 
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index 0da31d41d413..8a4faa05f9e4 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -221,13 +221,13 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
 	PPC_BCC(COND_GE, out);
 
 	/*
-	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
 	 *   goto out;
 	 */
 	EMIT(PPC_RAW_CMPLWI(_R0, MAX_TAIL_CALL_CNT));
 	/* tail_call_cnt++; */
 	EMIT(PPC_RAW_ADDIC(_R0, _R0, 1));
-	PPC_BCC(COND_GT, out);
+	PPC_BCC(COND_GE, out);
 
 	/* prog = array->ptrs[index]; */
 	EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29));
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 8b5157ccfeba..8571aafcc9e1 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -228,12 +228,12 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
 	PPC_BCC(COND_GE, out);
 
 	/*
-	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
 	 *   goto out;
 	 */
 	PPC_BPF_LL(b2p[TMP_REG_1], 1, bpf_jit_stack_tailcallcnt(ctx));
 	EMIT(PPC_RAW_CMPLWI(b2p[TMP_REG_1], MAX_TAIL_CALL_CNT));
-	PPC_BCC(COND_GT, out);
+	PPC_BCC(COND_GE, out);
 
 	/*
 	 * tail_call_cnt++;
diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
index e6497424cbf6..529a83b85c1c 100644
--- a/arch/riscv/net/bpf_jit_comp32.c
+++ b/arch/riscv/net/bpf_jit_comp32.c
@@ -799,11 +799,10 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 	emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx);
 
 	/*
-	 * temp_tcc = tcc - 1;
-	 * if (tcc < 0)
+	 * if (--tcc < 0)
 	 *   goto out;
 	 */
-	emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx);
+	emit(rv_addi(RV_REG_TCC, RV_REG_TCC, -1), ctx);
 	off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
 	emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx);
 
@@ -829,7 +828,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 	if (is_12b_check(off, insn))
 		return -1;
 	emit(rv_lw(RV_REG_T0, off, RV_REG_T0), ctx);
-	emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx);
 	/* Epilogue jumps to *(t0 + 4). */
 	__build_epilogue(true, ctx);
 	return 0;
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index f2a779c7e225..603630b6f3c5 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -327,12 +327,12 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 	off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
 	emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx);
 
-	/* if (TCC-- < 0)
+	/* if (--TCC < 0)
 	 *     goto out;
 	 */
-	emit_addi(RV_REG_T1, tcc, -1, ctx);
+	emit_addi(RV_REG_TCC, tcc, -1, ctx);
 	off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
-	emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx);
+	emit_branch(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx);
 
 	/* prog = array->ptrs[index];
 	 * if (!prog)
@@ -352,7 +352,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
 	if (is_12b_check(off, insn))
 		return -1;
 	emit_ld(RV_REG_T3, off, RV_REG_T2, ctx);
-	emit_mv(RV_REG_TCC, RV_REG_T1, ctx);
 	__build_epilogue(true, ctx);
 	return 0;
 }
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 233cc9bcd652..9ff2bd83aad7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1369,7 +1369,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 				 jit->prg);
 
 		/*
-		 * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
+		 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 		 *         goto out;
 		 */
 
@@ -1381,9 +1381,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 		EMIT4_IMM(0xa7080000, REG_W0, 1);
 		/* laal %w1,%w0,off(%r15) */
 		EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
-		/* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */
+		/* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */
 		patch_2_clij = jit->prg;
-		EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT,
+		EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1,
 				 2, jit->prg);
 
 		/*
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 9a2f20cbd48b..0bfe1c72a0c9 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -867,7 +867,7 @@ static void emit_tail_call(struct jit_ctx *ctx)
 	emit(LD32 | IMMED | RS1(SP) | S13(off) | RD(tmp), ctx);
 	emit_cmpi(tmp, MAX_TAIL_CALL_CNT, ctx);
 #define OFFSET2 13
-	emit_branch(BGU, ctx->idx, ctx->idx + OFFSET2, ctx);
+	emit_branch(BGEU, ctx->idx, ctx->idx + OFFSET2, ctx);
 	emit_nop(ctx);
 
 	emit_alu_K(ADD, tmp, 1, ctx);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..631847907786 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -412,7 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
- *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *   if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
  *     goto out;
  *   prog = array->ptrs[index];
  *   if (prog == NULL)
@@ -446,14 +446,14 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	EMIT2(X86_JBE, offset);                   /* jbe out */
 
 	/*
-	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, tcc_off);         /* mov eax, dword ptr [rbp - tcc_off] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
 
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
-	EMIT2(X86_JA, offset);                    /* ja out */
+	EMIT2(X86_JAE, offset);                   /* jae out */
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, tcc_off);         /* mov dword ptr [rbp - tcc_off], eax */
 
@@ -504,14 +504,14 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 	int offset;
 
 	/*
-	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, tcc_off);             /* mov eax, dword ptr [rbp - tcc_off] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);         /* cmp eax, MAX_TAIL_CALL_CNT */
 
 	offset = ctx->tail_call_direct_label - (prog + 2 - start);
-	EMIT2(X86_JA, offset);                        /* ja out */
+	EMIT2(X86_JAE, offset);                       /* jae out */
 	EMIT3(0x83, 0xC0, 0x01);                      /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, tcc_off);             /* mov dword ptr [rbp - tcc_off], eax */
 
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index da9b7cfa4632..429a89c5468b 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1323,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
 	EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
 
 	/*
-	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 	 *     goto out;
 	 */
 	lo = (u32)MAX_TAIL_CALL_CNT;
@@ -1337,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
 	/* cmp ecx,lo */
 	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
 
-	/* ja out */
+	/* jae out */
 	EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
 
 	/* add eax,0x1 */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 56098c866704..cc7a0c36e7df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1081,7 +1081,7 @@ struct bpf_array {
 };
 
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
-#define MAX_TAIL_CALL_CNT 32
+#define MAX_TAIL_CALL_CNT 33
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6297eafdc40f..a69e4b04ffeb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1744,7 +1744,7 @@ union bpf_attr {
  * 		if the maximum number of tail calls has been reached for this
  * 		chain of programs. This limit is defined in the kernel by the
  * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
- * 		which is currently set to 32.
+ *		which is currently set to 33.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2405e39d800f..b52dc845ecea 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1574,7 +1574,8 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+		if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index adae39567264..0c5cb2d6436a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -14683,7 +14683,7 @@ static struct tail_call_test tail_call_tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE,
-		.result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS,
+		.result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS,
 	},
 	{
 		"Tail call count preserved across function calls",
@@ -14705,7 +14705,7 @@ static struct tail_call_test tail_call_tests[] = {
 		},
 		.stack_depth = 8,
 		.flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE,
-		.result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS,
+		.result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS,
 	},
 	{
 		"Tail call error path, NULL target",
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6297eafdc40f..a69e4b04ffeb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1744,7 +1744,7 @@ union bpf_attr {
  * 		if the maximum number of tail calls has been reached for this
  * 		chain of programs. This limit is defined in the kernel by the
  * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
- * 		which is currently set to 32.
+ *		which is currently set to 33.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
-- 
cgit v1.2.3


From d41bc48bfab2076f7db88d079a3a3203dd9c4a54 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 15 Nov 2021 17:30:41 -0800
Subject: selftests/bpf: Add uprobe triggering overhead benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add benchmark to measure overhead of uprobes and uretprobes. Also have
a baseline (no uprobe attached) benchmark.

On my dev machine, baseline benchmark can trigger 130M user_target()
invocations. When uprobe is attached, this falls to just 700K. With
uretprobe, we get down to 520K:

  $ sudo ./bench trig-uprobe-base -a
  Summary: hits  131.289 ± 2.872M/s

  # UPROBE
  $ sudo ./bench -a trig-uprobe-without-nop
  Summary: hits    0.729 ± 0.007M/s

  $ sudo ./bench -a trig-uprobe-with-nop
  Summary: hits    1.798 ± 0.017M/s

  # URETPROBE
  $ sudo ./bench -a trig-uretprobe-without-nop
  Summary: hits    0.508 ± 0.012M/s

  $ sudo ./bench -a trig-uretprobe-with-nop
  Summary: hits    0.883 ± 0.008M/s

So there is almost 2.5x performance difference between probing nop vs
non-nop instruction for entry uprobe. And 1.7x difference for uretprobe.

This means that non-nop uprobe overhead is around 1.4 microseconds for uprobe
and 2 microseconds for non-nop uretprobe.

For nop variants, uprobe and uretprobe overhead is down to 0.556 and
1.13 microseconds, respectively.

For comparison, just doing a very low-overhead syscall (with no BPF
programs attached anywhere) gives:

  $ sudo ./bench trig-base -a
  Summary: hits    4.830 ± 0.036M/s

So uprobes are about 2.67x slower than pure context switch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211116013041.4072571-1-andrii@kernel.org
---
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/bench.c                |  10 ++
 tools/testing/selftests/bpf/benchs/bench_trigger.c | 146 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/trigger_bench.c  |   7 +
 4 files changed, 166 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0470802c907c..35684d61aaeb 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -533,7 +533,9 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
-$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
+$(OUTPUT)/bench: $(OUTPUT)/bench.o \
+		 $(OUTPUT)/testing_helpers.o \
+		 $(OUTPUT)/trace_helpers.o \
 		 $(OUTPUT)/bench_count.o \
 		 $(OUTPUT)/bench_rename.o \
 		 $(OUTPUT)/bench_trigger.o \
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index cc4722f693e9..c75e7ee28746 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -359,6 +359,11 @@ extern const struct bench bench_trig_kprobe;
 extern const struct bench bench_trig_fentry;
 extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_uprobe_base;
+extern const struct bench bench_trig_uprobe_with_nop;
+extern const struct bench bench_trig_uretprobe_with_nop;
+extern const struct bench bench_trig_uprobe_without_nop;
+extern const struct bench bench_trig_uretprobe_without_nop;
 extern const struct bench bench_rb_libbpf;
 extern const struct bench bench_rb_custom;
 extern const struct bench bench_pb_libbpf;
@@ -385,6 +390,11 @@ static const struct bench *benchs[] = {
 	&bench_trig_fentry,
 	&bench_trig_fentry_sleep,
 	&bench_trig_fmodret,
+	&bench_trig_uprobe_base,
+	&bench_trig_uprobe_with_nop,
+	&bench_trig_uretprobe_with_nop,
+	&bench_trig_uprobe_without_nop,
+	&bench_trig_uretprobe_without_nop,
 	&bench_rb_libbpf,
 	&bench_rb_custom,
 	&bench_pb_libbpf,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index f41a491a8cc0..049a5ad56f65 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2020 Facebook */
 #include "bench.h"
 #include "trigger_bench.skel.h"
+#include "trace_helpers.h"
 
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
@@ -107,6 +108,101 @@ static void *trigger_consumer(void *input)
 	return NULL;
 }
 
+/* make sure call is not inlined and not avoided by compiler, so __weak and
+ * inline asm volatile in the body of the function
+ *
+ * There is a performance difference between uprobing at nop location vs other
+ * instructions. So use two different targets, one of which starts with nop
+ * and another doesn't.
+ *
+ * GCC doesn't generate stack setup preample for these functions due to them
+ * having no input arguments and doing nothing in the body.
+ */
+__weak void uprobe_target_with_nop(void)
+{
+	asm volatile ("nop");
+}
+
+__weak void uprobe_target_without_nop(void)
+{
+	asm volatile ("");
+}
+
+static void *uprobe_base_producer(void *input)
+{
+	while (true) {
+		uprobe_target_with_nop();
+		atomic_inc(&base_hits.value);
+	}
+	return NULL;
+}
+
+static void *uprobe_producer_with_nop(void *input)
+{
+	while (true)
+		uprobe_target_with_nop();
+	return NULL;
+}
+
+static void *uprobe_producer_without_nop(void *input)
+{
+	while (true)
+		uprobe_target_without_nop();
+	return NULL;
+}
+
+static void usetup(bool use_retprobe, bool use_nop)
+{
+	size_t uprobe_offset;
+	ssize_t base_addr;
+	struct bpf_link *link;
+
+	setup_libbpf();
+
+	ctx.skel = trigger_bench__open_and_load();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	base_addr = get_base_addr();
+	if (use_nop)
+		uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop, base_addr);
+	else
+		uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop, base_addr);
+
+	link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
+					  use_retprobe,
+					  -1 /* all PIDs */,
+					  "/proc/self/exe",
+					  uprobe_offset);
+	if (!link) {
+		fprintf(stderr, "failed to attach uprobe!\n");
+		exit(1);
+	}
+	ctx.skel->links.bench_trigger_uprobe = link;
+}
+
+static void uprobe_setup_with_nop()
+{
+	usetup(false, true);
+}
+
+static void uretprobe_setup_with_nop()
+{
+	usetup(true, true);
+}
+
+static void uprobe_setup_without_nop()
+{
+	usetup(false, false);
+}
+
+static void uretprobe_setup_without_nop()
+{
+	usetup(true, false);
+}
+
 const struct bench bench_trig_base = {
 	.name = "trig-base",
 	.validate = trigger_validate,
@@ -182,3 +278,53 @@ const struct bench bench_trig_fmodret = {
 	.report_progress = hits_drops_report_progress,
 	.report_final = hits_drops_report_final,
 };
+
+const struct bench bench_trig_uprobe_base = {
+	.name = "trig-uprobe-base",
+	.setup = NULL, /* no uprobe/uretprobe is attached */
+	.producer_thread = uprobe_base_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_base_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uprobe_with_nop = {
+	.name = "trig-uprobe-with-nop",
+	.setup = uprobe_setup_with_nop,
+	.producer_thread = uprobe_producer_with_nop,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uretprobe_with_nop = {
+	.name = "trig-uretprobe-with-nop",
+	.setup = uretprobe_setup_with_nop,
+	.producer_thread = uprobe_producer_with_nop,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uprobe_without_nop = {
+	.name = "trig-uprobe-without-nop",
+	.setup = uprobe_setup_without_nop,
+	.producer_thread = uprobe_producer_without_nop,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uretprobe_without_nop = {
+	.name = "trig-uretprobe-without-nop",
+	.setup = uretprobe_setup_without_nop,
+	.producer_thread = uprobe_producer_without_nop,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 9a4d09590b3d..2098f3f27f18 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -52,3 +52,10 @@ int bench_trigger_fmodret(void *ctx)
 	__sync_add_and_fetch(&hits, 1);
 	return -22;
 }
+
+SEC("uprobe/self/uprobe_target")
+int bench_trigger_uprobe(void *ctx)
+{
+	__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
-- 
cgit v1.2.3


From ea78548e0f98951fa7641037ad98a750137d6b6a Mon Sep 17 00:00:00 2001
From: Yucong Sun <sunyucong@gmail.com>
Date: Fri, 12 Nov 2021 11:25:32 -0800
Subject: selftests/bpf: Move summary line after the error logs

Makes it easier to find the summary line when there is a lot of logs to
scroll back.

Signed-off-by: Yucong Sun <sunyucong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112192535.898352-2-fallentree@fb.com
---
 tools/testing/selftests/bpf/test_progs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c65986bd9d07..d129ea5c9a48 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1198,11 +1198,11 @@ static int server_main(void)
 		env.sub_succ_cnt += result->sub_succ_cnt;
 	}
 
+	print_all_error_logs();
+
 	fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
 		env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
 
-	print_all_error_logs();
-
 	/* reap all workers */
 	for (i = 0; i < env.workers; i++) {
 		int wstatus, pid;
@@ -1484,11 +1484,11 @@ int main(int argc, char **argv)
 	if (env.list_test_names)
 		goto out;
 
+	print_all_error_logs();
+
 	fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
 		env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
 
-	print_all_error_logs();
-
 	close(env.saved_netns_fd);
 out:
 	if (!env.list_test_names && env.has_testmod)
-- 
cgit v1.2.3


From 67d61d30b8a8f33d2a4f269f3a548409827d7b01 Mon Sep 17 00:00:00 2001
From: Yucong Sun <sunyucong@gmail.com>
Date: Fri, 12 Nov 2021 11:25:33 -0800
Subject: selftests/bpf: Variable naming fix

Change log_fd to log_fp to reflect its type correctly.

Signed-off-by: Yucong Sun <sunyucong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112192535.898352-3-fallentree@fb.com
---
 tools/testing/selftests/bpf/test_progs.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index d129ea5c9a48..926475aa10bb 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -939,7 +939,7 @@ static void *dispatch_thread(void *ctx)
 {
 	struct dispatch_data *data = ctx;
 	int sock_fd;
-	FILE *log_fd = NULL;
+	FILE *log_fp = NULL;
 
 	sock_fd = data->sock_fd;
 
@@ -1002,8 +1002,8 @@ static void *dispatch_thread(void *ctx)
 
 			/* collect all logs */
 			if (msg_test_done.test_done.have_log) {
-				log_fd = open_memstream(&result->log_buf, &result->log_cnt);
-				if (!log_fd)
+				log_fp = open_memstream(&result->log_buf, &result->log_cnt);
+				if (!log_fp)
 					goto error;
 
 				while (true) {
@@ -1014,12 +1014,12 @@ static void *dispatch_thread(void *ctx)
 					if (msg_log.type != MSG_TEST_LOG)
 						goto error;
 
-					fprintf(log_fd, "%s", msg_log.test_log.log_buf);
+					fprintf(log_fp, "%s", msg_log.test_log.log_buf);
 					if (msg_log.test_log.is_last)
 						break;
 				}
-				fclose(log_fd);
-				log_fd = NULL;
+				fclose(log_fp);
+				log_fp = NULL;
 			}
 			/* output log */
 			{
@@ -1045,8 +1045,8 @@ error:
 	if (env.debug)
 		fprintf(stderr, "[%d]: Protocol/IO error: %s.\n", data->worker_id, strerror(errno));
 
-	if (log_fd)
-		fclose(log_fd);
+	if (log_fp)
+		fclose(log_fp);
 done:
 	{
 		struct msg msg_exit;
-- 
cgit v1.2.3


From db813d7bd919c521b869d657dc4a2a2335974cc4 Mon Sep 17 00:00:00 2001
From: Yucong Sun <sunyucong@gmail.com>
Date: Fri, 12 Nov 2021 11:25:34 -0800
Subject: selftests/bpf: Mark variable as static

Fix warnings from checkstyle.pl

Signed-off-by: Yucong Sun <sunyucong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211112192535.898352-4-fallentree@fb.com
---
 tools/testing/selftests/bpf/test_progs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 926475aa10bb..296928948bb9 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -473,11 +473,11 @@ static struct prog_test_def prog_test_defs[] = {
 #include <prog_tests/tests.h>
 #undef DEFINE_TEST
 };
-const int prog_test_cnt = ARRAY_SIZE(prog_test_defs);
+static const int prog_test_cnt = ARRAY_SIZE(prog_test_defs);
 
 const char *argp_program_version = "test_progs 0.1";
 const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
-const char argp_program_doc[] = "BPF selftests test runner";
+static const char argp_program_doc[] = "BPF selftests test runner";
 
 enum ARG_KEYS {
 	ARG_TEST_NUM = 'n',
-- 
cgit v1.2.3


From dd7f091fd22b1dce6c20e8f7769aa068ed88ac6d Mon Sep 17 00:00:00 2001
From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
Date: Wed, 17 Nov 2021 18:06:13 +0530
Subject: selftests/bpf: Fix xdpxceiver failures for no hugepages

xsk_configure_umem() needs hugepages to work in unaligned mode. So when
hugepages are not configured, 'unaligned' tests should be skipped which
is determined by the helper function hugepages_present(). This function
erroneously returns true with MAP_NORESERVE flag even when no hugepages
are configured. The removal of this flag fixes the issue.

The test TEST_TYPE_UNALIGNED_INV_DESC also needs to be skipped when
there are no hugepages. However, this was not skipped as there was no
check for presence of hugepages and hence was failing. The check to skip
the test has now been added.

Fixes: a4ba98dd0c69 (selftests: xsk: Add test for unaligned mode)
Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211117123613.22288-1-tirthendu.sarkar@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index fe7f423b8c3f..040164c7efc1 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -1217,7 +1217,7 @@ static bool hugepages_present(struct ifobject *ifobject)
 	void *bufs;
 
 	bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE | MAP_HUGETLB, -1, 0);
+		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 	if (bufs == MAP_FAILED)
 		return false;
 
@@ -1364,6 +1364,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 		testapp_invalid_desc(test);
 		break;
 	case TEST_TYPE_UNALIGNED_INV_DESC:
+		if (!hugepages_present(test->ifobj_tx)) {
+			ksft_test_result_skip("No 2M huge pages present.\n");
+			return;
+		}
 		test_spec_set_name(test, "UNALIGNED_INV_DESC");
 		test->ifobj_tx->umem->unaligned_mode = true;
 		test->ifobj_rx->umem->unaligned_mode = true;
-- 
cgit v1.2.3


From 8ff978b8b222bc9d51dd109a46b51026336c95d8 Mon Sep 17 00:00:00 2001
From: Riccardo Paolo Bestetti <pbl@bestov.io>
Date: Wed, 17 Nov 2021 10:00:11 +0100
Subject: ipv4/raw: support binding to nonlocal addresses

Add support to inet v4 raw sockets for binding to nonlocal addresses
through the IP_FREEBIND and IP_TRANSPARENT socket options, as well as
the ipv4.ip_nonlocal_bind kernel parameter.

Add helper function to inet_sock.h to check for bind address validity on
the base of the address type and whether nonlocal address are enabled
for the socket via any of the sockopts/sysctl, deduplicating checks in
ipv4/ping.c, ipv4/af_inet.c, ipv6/af_inet6.c (for mapped v4->v6
addresses), and ipv4/raw.c.

Add test cases with IP[V6]_FREEBIND verifying that both v4 and v6 raw
sockets support binding to nonlocal addresses after the change. Add
necessary support for the test cases to nettest.

Signed-off-by: Riccardo Paolo Bestetti <pbl@bestov.io>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20211117090010.125393-1-pbl@bestov.io
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_sock.h                   | 12 ++++++++++
 net/ipv4/af_inet.c                        |  7 ++----
 net/ipv4/ping.c                           | 14 ++++-------
 net/ipv4/raw.c                            | 13 +++++-----
 net/ipv6/af_inet6.c                       |  7 ++----
 tools/testing/selftests/net/fcnal-test.sh | 40 +++++++++++++++++++++++++++++++
 tools/testing/selftests/net/nettest.c     | 33 ++++++++++++++++++++++++-
 7 files changed, 100 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 9e1111f5915b..234d70ae5f4c 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -372,4 +372,16 @@ static inline bool inet_can_nonlocal_bind(struct net *net,
 		inet->freebind || inet->transparent;
 }
 
+static inline bool inet_addr_valid_or_nonlocal(struct net *net,
+					       struct inet_sock *inet,
+					       __be32 addr,
+					       int addr_type)
+{
+	return inet_can_nonlocal_bind(net, inet) ||
+		addr == htonl(INADDR_ANY) ||
+		addr_type == RTN_LOCAL ||
+		addr_type == RTN_MULTICAST ||
+		addr_type == RTN_BROADCAST;
+}
+
 #endif	/* _INET_SOCK_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6d52b6491255..c66b0563a267 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -489,11 +489,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 	 *  is temporarily down)
 	 */
 	err = -EADDRNOTAVAIL;
-	if (!inet_can_nonlocal_bind(net, inet) &&
-	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
-	    chk_addr_ret != RTN_LOCAL &&
-	    chk_addr_ret != RTN_MULTICAST &&
-	    chk_addr_ret != RTN_BROADCAST)
+	if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+	                                 chk_addr_ret))
 		goto out;
 
 	snum = ntohs(addr->sin_port);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 1e44a43acfe2..e540b0dcf085 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -311,15 +311,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
 			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
 
-		if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
-			chk_addr_ret = RTN_LOCAL;
-		else
-			chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
-
-		if ((!inet_can_nonlocal_bind(net, isk) &&
-		     chk_addr_ret != RTN_LOCAL) ||
-		    chk_addr_ret == RTN_MULTICAST ||
-		    chk_addr_ret == RTN_BROADCAST)
+		chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+		if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk),
+					         addr->sin_addr.s_addr,
+	                                         chk_addr_ret))
 			return -EADDRNOTAVAIL;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 3910bb08a986..a53f256bf9d3 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -717,6 +717,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+	struct net *net = sock_net(sk);
 	u32 tb_id = RT_TABLE_LOCAL;
 	int ret = -EINVAL;
 	int chk_addr_ret;
@@ -725,16 +726,16 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto out;
 
 	if (sk->sk_bound_dev_if)
-		tb_id = l3mdev_fib_table_by_index(sock_net(sk),
-						 sk->sk_bound_dev_if) ? : tb_id;
+		tb_id = l3mdev_fib_table_by_index(net,
+						  sk->sk_bound_dev_if) ? : tb_id;
 
-	chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
-					    tb_id);
+	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
 
 	ret = -EADDRNOTAVAIL;
-	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
-	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+	if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+					 chk_addr_ret))
 		goto out;
+
 	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
 		inet->inet_saddr = 0;  /* Use device */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0c4da163535a..1f28c9820c2e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -337,11 +337,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 		chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr);
 		rcu_read_unlock();
 
-		if (!inet_can_nonlocal_bind(net, inet) &&
-		    v4addr != htonl(INADDR_ANY) &&
-		    chk_addr_ret != RTN_LOCAL &&
-		    chk_addr_ret != RTN_MULTICAST &&
-		    chk_addr_ret != RTN_BROADCAST) {
+		if (!inet_addr_valid_or_nonlocal(net, inet, v4addr,
+						 chk_addr_ret)) {
 			err = -EADDRNOTAVAIL;
 			goto out;
 		}
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 3313566ce906..7caa4f0e067d 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -66,6 +66,10 @@ NSB_LO_IP=172.16.2.2
 NSA_LO_IP6=2001:db8:2::1
 NSB_LO_IP6=2001:db8:2::2
 
+# non-local addresses for freebind tests
+NL_IP=172.17.1.1
+NL_IP6=2001:db8:4::1
+
 MD5_PW=abc123
 MD5_WRONG_PW=abc1234
 
@@ -316,6 +320,9 @@ addr2str()
 	${NSB_LO_IP6})	echo "ns-B loopback IPv6";;
 	${NSB_LINKIP6}|${NSB_LINKIP6}%*) echo "ns-B IPv6 LLA";;
 
+	${NL_IP})       echo "nonlocal IP";;
+	${NL_IP6})      echo "nonlocal IPv6";;
+
 	${VRF_IP})	echo "VRF IP";;
 	${VRF_IP6})	echo "VRF IPv6";;
 
@@ -1767,6 +1774,14 @@ ipv4_addr_bind_novrf()
 		log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
 	done
 
+	#
+	# raw socket with nonlocal bind
+	#
+	a=${NL_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${NSA_DEV} -b
+	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after device bind"
+
 	#
 	# tcp sockets
 	#
@@ -1815,6 +1830,14 @@ ipv4_addr_bind_vrf()
 	run_cmd nettest -s -R -P icmp -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 1 "Raw socket bind to out of scope address after VRF bind"
 
+	#
+	# raw socket with nonlocal bind
+	#
+	a=${NL_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind"
+
 	#
 	# tcp sockets
 	#
@@ -1965,6 +1988,7 @@ ipv4_rt()
 
 	a=${NSA_IP}
 	log_start
+
 	run_cmd nettest ${varg} -s &
 	sleep 1
 	run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
@@ -3402,6 +3426,14 @@ ipv6_addr_bind_novrf()
 		log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
 	done
 
+	#
+	# raw socket with nonlocal bind
+	#
+	a=${NL_IP6}
+	log_start
+	run_cmd nettest -6 -s -R -P icmp -f -l ${a} -I ${NSA_DEV} -b
+	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address"
+
 	#
 	# tcp sockets
 	#
@@ -3443,6 +3475,14 @@ ipv6_addr_bind_vrf()
 	run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 1 "Raw socket bind to invalid local address after vrf bind"
 
+	#
+	# raw socket with nonlocal bind
+	#
+	a=${NL_IP6}
+	log_start
+	run_cmd nettest -6 -s -R -P icmp -f -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind"
+
 	#
 	# tcp sockets
 	#
diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c
index b599003eb5ba..d9a6fd2cd9d3 100644
--- a/tools/testing/selftests/net/nettest.c
+++ b/tools/testing/selftests/net/nettest.c
@@ -85,6 +85,7 @@ struct sock_args {
 	int version;   /* AF_INET/AF_INET6 */
 
 	int use_setsockopt;
+	int use_freebind;
 	int use_cmsg;
 	const char *dev;
 	const char *server_dev;
@@ -514,6 +515,29 @@ static int set_membership(int sd, uint32_t grp, uint32_t addr, int ifindex)
 	return 0;
 }
 
+static int set_freebind(int sd, int version)
+{
+	unsigned int one = 1;
+	int rc = 0;
+
+	switch (version) {
+	case AF_INET:
+		if (setsockopt(sd, SOL_IP, IP_FREEBIND, &one, sizeof(one))) {
+			log_err_errno("setsockopt(IP_FREEBIND)");
+			rc = -1;
+		}
+		break;
+	case AF_INET6:
+		if (setsockopt(sd, SOL_IPV6, IPV6_FREEBIND, &one, sizeof(one))) {
+			log_err_errno("setsockopt(IPV6_FREEBIND");
+			rc = -1;
+		}
+		break;
+	}
+
+	return rc;
+}
+
 static int set_broadcast(int sd)
 {
 	unsigned int one = 1;
@@ -1419,6 +1443,9 @@ static int lsock_init(struct sock_args *args)
 		 set_unicast_if(sd, args->ifindex, args->version))
 		goto err;
 
+	if (args->use_freebind && set_freebind(sd, args->version))
+		goto err;
+
 	if (bind_socket(sd, args))
 		goto err;
 
@@ -1827,7 +1854,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args)
 	return client_status;
 }
 
-#define GETOPT_STR  "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6xL:0:1:2:3:Fbq"
+#define GETOPT_STR  "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SCi6xL:0:1:2:3:Fbqf"
 #define OPT_FORCE_BIND_KEY_IFINDEX 1001
 #define OPT_NO_BIND_KEY_IFINDEX 1002
 
@@ -1864,6 +1891,7 @@ static void print_usage(char *prog)
 	"    -I dev        bind socket to given device name - server mode\n"
 	"    -S            use setsockopt (IP_UNICAST_IF or IP_MULTICAST_IF)\n"
 	"                  to set device binding\n"
+	"    -f            bind socket with the IP[V6]_FREEBIND option\n"
 	"    -C            use cmsg and IP_PKTINFO to specify device binding\n"
 	"\n"
 	"    -L len        send random message of given length\n"
@@ -1999,6 +2027,9 @@ int main(int argc, char *argv[])
 		case 'S':
 			args.use_setsockopt = 1;
 			break;
+		case 'f':
+			args.use_freebind = 1;
+			break;
 		case 'C':
 			args.use_cmsg = 1;
 			break;
-- 
cgit v1.2.3


From 29ad850a5cae84757bcd4c60e0d74232ef8c5157 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 18 Nov 2021 12:52:25 +0100
Subject: selfetests/bpf: Adapt vmtest.sh to s390 libbpf CI changes

[1] added s390 support to libbpf CI and added an ${ARCH} prefix to a
number of paths and identifiers in libbpf GitHub repo, which vmtest.sh
relies upon. Update these and make use of the new s390 support.

[1] https://github.com/libbpf/libbpf/pull/204

Co-developed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211118115225.1349726-1-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/vmtest.sh | 46 +++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 027198768fad..5e43c79ddc6e 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -4,17 +4,34 @@
 set -u
 set -e
 
-# This script currently only works for x86_64, as
-# it is based on the VM image used by the BPF CI which is
-# x86_64.
-QEMU_BINARY="${QEMU_BINARY:="qemu-system-x86_64"}"
-X86_BZIMAGE="arch/x86/boot/bzImage"
+# This script currently only works for x86_64 and s390x, as
+# it is based on the VM image used by the BPF CI, which is
+# available only for these architectures.
+ARCH="$(uname -m)"
+case "${ARCH}" in
+s390x)
+	QEMU_BINARY=qemu-system-s390x
+	QEMU_CONSOLE="ttyS1"
+	QEMU_FLAGS=(-smp 2)
+	BZIMAGE="arch/s390/boot/compressed/vmlinux"
+	;;
+x86_64)
+	QEMU_BINARY=qemu-system-x86_64
+	QEMU_CONSOLE="ttyS0,115200"
+	QEMU_FLAGS=(-cpu host -smp 8)
+	BZIMAGE="arch/x86/boot/bzImage"
+	;;
+*)
+	echo "Unsupported architecture"
+	exit 1
+	;;
+esac
 DEFAULT_COMMAND="./test_progs"
 MOUNT_DIR="mnt"
 ROOTFS_IMAGE="root.img"
 OUTPUT_DIR="$HOME/.bpf_selftests"
-KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/latest.config"
-KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config"
+KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/config-latest.${ARCH}"
+KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/config-latest.${ARCH}"
 INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX"
 NUM_COMPILE_JOBS="$(nproc)"
 LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")"
@@ -85,7 +102,7 @@ newest_rootfs_version()
 {
 	{
 	for file in "${!URLS[@]}"; do
-		if [[ $file =~ ^libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then
+		if [[ $file =~ ^"${ARCH}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then
 			echo "${BASH_REMATCH[1]}"
 		fi
 	done
@@ -102,7 +119,7 @@ download_rootfs()
 		exit 1
 	fi
 
-	download "libbpf-vmtest-rootfs-$rootfsversion.tar.zst" |
+	download "${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" |
 		zstd -d | sudo tar -C "$dir" -x
 }
 
@@ -224,13 +241,12 @@ EOF
 		-nodefaults \
 		-display none \
 		-serial mon:stdio \
-		-cpu host \
+		"${qemu_flags[@]}" \
 		-enable-kvm \
-		-smp 8 \
 		-m 4G \
 		-drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \
 		-kernel "${kernel_bzimage}" \
-		-append "root=/dev/vda rw console=ttyS0,115200"
+		-append "root=/dev/vda rw console=${QEMU_CONSOLE}"
 }
 
 copy_logs()
@@ -282,7 +298,7 @@ main()
 	local kernel_checkout=$(realpath "${script_dir}"/../../../../)
 	# By default the script searches for the kernel in the checkout directory but
 	# it also obeys environment variables O= and KBUILD_OUTPUT=
-	local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}"
+	local kernel_bzimage="${kernel_checkout}/${BZIMAGE}"
 	local command="${DEFAULT_COMMAND}"
 	local update_image="no"
 	local exit_command="poweroff -f"
@@ -337,13 +353,13 @@ main()
 		if is_rel_path "${O}"; then
 			O="$(realpath "${PWD}/${O}")"
 		fi
-		kernel_bzimage="${O}/${X86_BZIMAGE}"
+		kernel_bzimage="${O}/${BZIMAGE}"
 		make_command="${make_command} O=${O}"
 	elif [[ "${KBUILD_OUTPUT:=""}" != "" ]]; then
 		if is_rel_path "${KBUILD_OUTPUT}"; then
 			KBUILD_OUTPUT="$(realpath "${PWD}/${KBUILD_OUTPUT}")"
 		fi
-		kernel_bzimage="${KBUILD_OUTPUT}/${X86_BZIMAGE}"
+		kernel_bzimage="${KBUILD_OUTPUT}/${BZIMAGE}"
 		make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}"
 	fi
 
-- 
cgit v1.2.3


From 7615209f42a1976894cd0df97a380a034911656a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 18 Nov 2021 09:40:54 -0800
Subject: libbpf: Add runtime APIs to query libbpf version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Libbpf provided LIBBPF_MAJOR_VERSION and LIBBPF_MINOR_VERSION macros to
check libbpf version at compilation time. This doesn't cover all the
needs, though, because version of libbpf that application is compiled
against doesn't necessarily match the version of libbpf at runtime,
especially if libbpf is used as a shared library.

Add libbpf_major_version() and libbpf_minor_version() returning major
and minor versions, respectively, as integers. Also add a convenience
libbpf_version_string() for various tooling using libbpf to print out
libbpf version in a human-readable form. Currently it will return
"v0.6", but in the future it can contains some extra information, so the
format itself is not part of a stable API and shouldn't be relied upon.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20211118174054.2699477-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c   | 19 +++++++++++++++++++
 tools/lib/bpf/libbpf.h   |  4 ++++
 tools/lib/bpf/libbpf.map |  3 +++
 3 files changed, 26 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index de7e09a6b5ec..78de238f975a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -168,6 +168,25 @@ int libbpf_set_strict_mode(enum libbpf_strict_mode mode)
 	return 0;
 }
 
+__u32 libbpf_major_version(void)
+{
+	return LIBBPF_MAJOR_VERSION;
+}
+
+__u32 libbpf_minor_version(void)
+{
+	return LIBBPF_MINOR_VERSION;
+}
+
+const char *libbpf_version_string(void)
+{
+#define __S(X) #X
+#define _S(X) __S(X)
+	return  "v" _S(LIBBPF_MAJOR_VERSION) "." _S(LIBBPF_MINOR_VERSION);
+#undef _S
+#undef __S
+}
+
 enum kern_feature_id {
 	/* v4.14: kernel support for program & map names. */
 	FEAT_PROG_NAME,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 4ec69f224342..003fdc5cf3a8 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -24,6 +24,10 @@
 extern "C" {
 #endif
 
+LIBBPF_API __u32 libbpf_major_version(void);
+LIBBPF_API __u32 libbpf_minor_version(void);
+LIBBPF_API const char *libbpf_version_string(void);
+
 enum libbpf_errno {
 	__LIBBPF_ERRNO__START = 4000,
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 6a59514a48cf..bea6791272e5 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -410,6 +410,9 @@ LIBBPF_0.6.0 {
 		btf__type_cnt;
 		btf_dump__new;
 		btf_dump__new_deprecated;
+		libbpf_major_version;
+		libbpf_minor_version;
+		libbpf_version_string;
 		perf_buffer__new;
 		perf_buffer__new_deprecated;
 		perf_buffer__new_raw;
-- 
cgit v1.2.3


From efdd3eb8015e7447095f02a26eaabd164cd18004 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 17 Nov 2021 11:41:13 -0800
Subject: libbpf: Accommodate DWARF/compiler bug with duplicated structs

According to [0], compilers sometimes might produce duplicate DWARF
definitions for exactly the same struct/union within the same
compilation unit (CU). We've had similar issues with identical arrays
and handled them with a similar workaround in 6b6e6b1d09aa ("libbpf:
Accomodate DWARF/compiler bug with duplicated identical arrays"). Do the
same for struct/union by ensuring that two structs/unions are exactly
the same, down to the integer values of field referenced type IDs.

Solving this more generically (allowing referenced types to be
equivalent, but using different type IDs, all within a single CU)
requires a huge complexity increase to handle many-to-many mappings
between canonidal and candidate type graphs. Before we invest in that,
let's see if this approach handles all the instances of this issue in
practice. Thankfully it's pretty rare, it seems.

  [0] https://lore.kernel.org/bpf/YXr2NFlJTAhHdZqq@krava/

Reported-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211117194114.347675-1-andrii@kernel.org
---
 tools/lib/bpf/btf.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b6be579e0dc6..e97217a77196 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -3477,8 +3477,8 @@ static long btf_hash_struct(struct btf_type *t)
 }
 
 /*
- * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type
- * IDs. This check is performed during type graph equivalence check and
+ * Check structural compatibility of two STRUCTs/UNIONs, ignoring referenced
+ * type IDs. This check is performed during type graph equivalence check and
  * referenced types equivalence is checked separately.
  */
 static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2)
@@ -3851,6 +3851,31 @@ static int btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2)
 	return btf_equal_array(t1, t2);
 }
 
+/* Check if given two types are identical STRUCT/UNION definitions */
+static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2)
+{
+	const struct btf_member *m1, *m2;
+	struct btf_type *t1, *t2;
+	int n, i;
+
+	t1 = btf_type_by_id(d->btf, id1);
+	t2 = btf_type_by_id(d->btf, id2);
+
+	if (!btf_is_composite(t1) || btf_kind(t1) != btf_kind(t2))
+		return false;
+
+	if (!btf_shallow_equal_struct(t1, t2))
+		return false;
+
+	m1 = btf_members(t1);
+	m2 = btf_members(t2);
+	for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) {
+		if (m1->type != m2->type)
+			return false;
+	}
+	return true;
+}
+
 /*
  * Check equivalence of BTF type graph formed by candidate struct/union (we'll
  * call it "candidate graph" in this description for brevity) to a type graph
@@ -3962,6 +3987,8 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 
 	hypot_type_id = d->hypot_map[canon_id];
 	if (hypot_type_id <= BTF_MAX_NR_TYPES) {
+		if (hypot_type_id == cand_id)
+			return 1;
 		/* In some cases compiler will generate different DWARF types
 		 * for *identical* array type definitions and use them for
 		 * different fields within the *same* struct. This breaks type
@@ -3970,8 +3997,18 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		 * types within a single CU. So work around that by explicitly
 		 * allowing identical array types here.
 		 */
-		return hypot_type_id == cand_id ||
-		       btf_dedup_identical_arrays(d, hypot_type_id, cand_id);
+		if (btf_dedup_identical_arrays(d, hypot_type_id, cand_id))
+			return 1;
+		/* It turns out that similar situation can happen with
+		 * struct/union sometimes, sigh... Handle the case where
+		 * structs/unions are exactly the same, down to the referenced
+		 * type IDs. Anything more complicated (e.g., if referenced
+		 * types are different, but equivalent) is *way more*
+		 * complicated and requires a many-to-many equivalence mapping.
+		 */
+		if (btf_dedup_identical_structs(d, hypot_type_id, cand_id))
+			return 1;
+		return 0;
 	}
 
 	if (btf_dedup_hypot_map_add(d, canon_id, cand_id))
-- 
cgit v1.2.3


From 9a49afe6f5a516eb33bec24be0f81cb35ca79445 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 17 Nov 2021 11:41:14 -0800
Subject: selftests/bpf: Add btf_dedup case with duplicated structs within CU

Add an artificial minimal example simulating compilers producing two
different types within a single CU that correspond to identical struct
definitions.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211117194114.347675-2-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/btf_dedup_split.c     | 113 +++++++++++++++++++++
 1 file changed, 113 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
index 9d3b8d7a1537..94ff9757557a 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
@@ -314,6 +314,117 @@ cleanup:
 	btf__free(btf1);
 }
 
+static void btf_add_dup_struct_in_cu(struct btf *btf, int start_id)
+{
+#define ID(n) (start_id + n)
+	btf__set_pointer_size(btf, 8); /* enforce 64-bit arch */
+
+	btf__add_int(btf, "int", 4, BTF_INT_SIGNED);    /* [1] int */
+
+	btf__add_struct(btf, "s", 8);                   /* [2] struct s { */
+	btf__add_field(btf, "a", ID(3), 0, 0);          /*      struct anon a; */
+	btf__add_field(btf, "b", ID(4), 0, 0);          /*      struct anon b; */
+							/* } */
+
+	btf__add_struct(btf, "(anon)", 8);              /* [3] struct anon { */
+	btf__add_field(btf, "f1", ID(1), 0, 0);         /*      int f1; */
+	btf__add_field(btf, "f2", ID(1), 32, 0);        /*      int f2; */
+							/* } */
+
+	btf__add_struct(btf, "(anon)", 8);              /* [4] struct anon { */
+	btf__add_field(btf, "f1", ID(1), 0, 0);         /*      int f1; */
+	btf__add_field(btf, "f2", ID(1), 32, 0);        /*      int f2; */
+							/* } */
+#undef ID
+}
+
+static void test_split_dup_struct_in_cu()
+{
+	struct btf *btf1, *btf2;
+	int err;
+
+	/* generate the base data.. */
+	btf1 = btf__new_empty();
+	if (!ASSERT_OK_PTR(btf1, "empty_main_btf"))
+		return;
+
+	btf_add_dup_struct_in_cu(btf1, 0);
+
+	VALIDATE_RAW_BTF(
+			btf1,
+			"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+			"[2] STRUCT 's' size=8 vlen=2\n"
+			"\t'a' type_id=3 bits_offset=0\n"
+			"\t'b' type_id=4 bits_offset=0",
+			"[3] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=1 bits_offset=0\n"
+			"\t'f2' type_id=1 bits_offset=32",
+			"[4] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=1 bits_offset=0\n"
+			"\t'f2' type_id=1 bits_offset=32");
+
+	/* ..dedup them... */
+	err = btf__dedup(btf1, NULL, NULL);
+	if (!ASSERT_OK(err, "btf_dedup"))
+		goto cleanup;
+
+	VALIDATE_RAW_BTF(
+			btf1,
+			"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+			"[2] STRUCT 's' size=8 vlen=2\n"
+			"\t'a' type_id=3 bits_offset=0\n"
+			"\t'b' type_id=3 bits_offset=0",
+			"[3] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=1 bits_offset=0\n"
+			"\t'f2' type_id=1 bits_offset=32");
+
+	/* and add the same data on top of it */
+	btf2 = btf__new_empty_split(btf1);
+	if (!ASSERT_OK_PTR(btf2, "empty_split_btf"))
+		goto cleanup;
+
+	btf_add_dup_struct_in_cu(btf2, 3);
+
+	VALIDATE_RAW_BTF(
+			btf2,
+			"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+			"[2] STRUCT 's' size=8 vlen=2\n"
+			"\t'a' type_id=3 bits_offset=0\n"
+			"\t'b' type_id=3 bits_offset=0",
+			"[3] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=1 bits_offset=0\n"
+			"\t'f2' type_id=1 bits_offset=32",
+			"[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+			"[5] STRUCT 's' size=8 vlen=2\n"
+			"\t'a' type_id=6 bits_offset=0\n"
+			"\t'b' type_id=7 bits_offset=0",
+			"[6] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=4 bits_offset=0\n"
+			"\t'f2' type_id=4 bits_offset=32",
+			"[7] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=4 bits_offset=0\n"
+			"\t'f2' type_id=4 bits_offset=32");
+
+	err = btf__dedup(btf2, NULL, NULL);
+	if (!ASSERT_OK(err, "btf_dedup"))
+		goto cleanup;
+
+	/* after dedup it should match the original data */
+	VALIDATE_RAW_BTF(
+			btf2,
+			"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+			"[2] STRUCT 's' size=8 vlen=2\n"
+			"\t'a' type_id=3 bits_offset=0\n"
+			"\t'b' type_id=3 bits_offset=0",
+			"[3] STRUCT '(anon)' size=8 vlen=2\n"
+			"\t'f1' type_id=1 bits_offset=0\n"
+			"\t'f2' type_id=1 bits_offset=32");
+
+cleanup:
+	btf__free(btf2);
+	btf__free(btf1);
+}
+
 void test_btf_dedup_split()
 {
 	if (test__start_subtest("split_simple"))
@@ -322,4 +433,6 @@ void test_btf_dedup_split()
 		test_split_struct_duped();
 	if (test__start_subtest("split_fwd_resolve"))
 		test_split_fwd_resolve();
+	if (test__start_subtest("split_dup_struct_in_cu"))
+		test_split_dup_struct_in_cu();
 }
-- 
cgit v1.2.3


From 8cccee9e91e19207671b94af40bacf7c1d2e74ef Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Fri, 19 Nov 2021 19:00:35 +0100
Subject: libbpf: Change bpf_program__set_extra_flags to bpf_program__set_flags

bpf_program__set_extra_flags has just been introduced so we can still
change it without breaking users.

This new interface is a bit more flexible (for example if someone wants
to clear a flag).

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211119180035.1396139-1-revest@chromium.org
---
 tools/lib/bpf/libbpf.c                        | 4 ++--
 tools/lib/bpf/libbpf.h                        | 2 +-
 tools/lib/bpf/libbpf.map                      | 2 +-
 tools/testing/selftests/bpf/testing_helpers.c | 4 +++-
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 78de238f975a..af405c38aadc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8315,12 +8315,12 @@ __u32 bpf_program__flags(const struct bpf_program *prog)
 	return prog->prog_flags;
 }
 
-int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags)
+int bpf_program__set_flags(struct bpf_program *prog, __u32 flags)
 {
 	if (prog->obj->loaded)
 		return libbpf_err(-EBUSY);
 
-	prog->prog_flags |= extra_flags;
+	prog->prog_flags = flags;
 	return 0;
 }
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 003fdc5cf3a8..d02139fec4ac 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -498,7 +498,7 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog,
 				      enum bpf_attach_type type);
 
 LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog);
-LIBBPF_API int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags);
+LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags);
 
 LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index bea6791272e5..69bc069f0a68 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -400,7 +400,7 @@ LIBBPF_0.6.0 {
 		bpf_program__flags;
 		bpf_program__insn_cnt;
 		bpf_program__insns;
-		bpf_program__set_extra_flags;
+		bpf_program__set_flags;
 		btf__add_btf;
 		btf__add_decl_tag;
 		btf__add_type_tag;
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 52c2f24e0898..0f1c37ac6f2c 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -91,6 +91,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	struct bpf_object_load_attr attr = {};
 	struct bpf_object *obj;
 	struct bpf_program *prog;
+	__u32 flags;
 	int err;
 
 	obj = bpf_object__open(file);
@@ -106,7 +107,8 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	if (type != BPF_PROG_TYPE_UNSPEC)
 		bpf_program__set_type(prog, type);
 
-	bpf_program__set_extra_flags(prog, BPF_F_TEST_RND_HI32);
+	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32;
+	bpf_program__set_flags(prog, flags);
 
 	attr.obj = obj;
 	attr.log_level = extra_prog_load_log_flags;
-- 
cgit v1.2.3


From 5fb62e9cd3adadd95303447ee8e3f62ee98b0e73 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 19 Nov 2021 12:41:37 -0800
Subject: selftests: mptcp: add tproxy test case

No hard dependencies here, just skip if test environ lacks
nft binary or the needed kernel config options.

The test case spawns listener in ns2 but ns1 will connect
to the ip address of ns4.

policy routing + tproxy rule will redirect packets to ns2 instead
of forward.

v3: - update mptcp/config (Mat Martineau)
    - more verbose SKIP messages in mptcp_connect.sh

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/config           |  8 ++-
 tools/testing/selftests/net/mptcp/mptcp_connect.c  | 51 +++++++++++++-
 tools/testing/selftests/net/mptcp/mptcp_connect.sh | 80 ++++++++++++++++++++++
 3 files changed, 136 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index 0faaccd21447..419e71560fd1 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -13,5 +13,9 @@ CONFIG_NFT_COUNTER=m
 CONFIG_NFT_COMPAT=m
 CONFIG_NETFILTER_XTABLES=m
 CONFIG_NETFILTER_XT_MATCH_BPF=m
-CONFIG_NF_TABLES_IPV4=y
-CONFIG_NF_TABLES_IPV6=y
+CONFIG_NF_TABLES_INET=y
+CONFIG_NFT_TPROXY=m
+CONFIG_NFT_SOCKET=m
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 95e81d557b08..ada9b80774d4 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -75,7 +75,12 @@ struct cfg_cmsg_types {
 	unsigned int timestampns:1;
 };
 
+struct cfg_sockopt_types {
+	unsigned int transparent:1;
+};
+
 static struct cfg_cmsg_types cfg_cmsg_types;
+static struct cfg_sockopt_types cfg_sockopt_types;
 
 static void die_usage(void)
 {
@@ -93,6 +98,7 @@ static void die_usage(void)
 	fprintf(stderr, "\t-u -- check mptcp ulp\n");
 	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
 	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
+	fprintf(stderr, "\t-o option -- test sockopt <option>\n");
 	fprintf(stderr,
 		"\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n");
 	exit(1);
@@ -185,6 +191,22 @@ static void set_mark(int fd, uint32_t mark)
 	}
 }
 
+static void set_transparent(int fd, int pf)
+{
+	int one = 1;
+
+	switch (pf) {
+	case AF_INET:
+		if (-1 == setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one)))
+			perror("IP_TRANSPARENT");
+		break;
+	case AF_INET6:
+		if (-1 == setsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &one, sizeof(one)))
+			perror("IPV6_TRANSPARENT");
+		break;
+	}
+}
+
 static int sock_listen_mptcp(const char * const listenaddr,
 			     const char * const port)
 {
@@ -212,6 +234,9 @@ static int sock_listen_mptcp(const char * const listenaddr,
 				     sizeof(one)))
 			perror("setsockopt");
 
+		if (cfg_sockopt_types.transparent)
+			set_transparent(sock, pf);
+
 		if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
 			break; /* success */
 
@@ -944,6 +969,27 @@ static void parse_cmsg_types(const char *type)
 	exit(1);
 }
 
+static void parse_setsock_options(const char *name)
+{
+	char *next = strchr(name, ',');
+	unsigned int len = 0;
+
+	if (next) {
+		parse_setsock_options(next + 1);
+		len = next - name;
+	} else {
+		len = strlen(name);
+	}
+
+	if (strncmp(name, "TRANSPARENT", len) == 0) {
+		cfg_sockopt_types.transparent = 1;
+		return;
+	}
+
+	fprintf(stderr, "Unrecognized setsockopt option %s\n", name);
+	exit(1);
+}
+
 int main_loop(void)
 {
 	int fd;
@@ -1047,7 +1093,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:")) != -1) {
+	while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:o:")) != -1) {
 		switch (c) {
 		case 'j':
 			cfg_join = true;
@@ -1108,6 +1154,9 @@ static void parse_opts(int argc, char **argv)
 		case 'c':
 			parse_cmsg_types(optarg);
 			break;
+		case 'o':
+			parse_setsock_options(optarg);
+			break;
 		}
 	}
 
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 559173a8e387..a4226b608c68 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -671,6 +671,82 @@ run_tests()
 	run_tests_lo $1 $2 $3 0
 }
 
+run_test_transparent()
+{
+	local connect_addr="$1"
+	local msg="$2"
+
+	local connector_ns="$ns1"
+	local listener_ns="$ns2"
+	local lret=0
+	local r6flag=""
+
+	# skip if we don't want v6
+	if ! $ipv6 && is_v6 "${connect_addr}"; then
+		return 0
+	fi
+
+ip netns exec "$listener_ns" nft -f /dev/stdin <<"EOF"
+flush ruleset
+table inet mangle {
+	chain divert {
+		type filter hook prerouting priority -150;
+
+		meta l4proto tcp socket transparent 1 meta mark set 1 accept
+		tcp dport 20000 tproxy to :20000 meta mark set 1 accept
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: $msg, could not load nft ruleset"
+		return
+	fi
+
+	local local_addr
+	if is_v6 "${connect_addr}"; then
+		local_addr="::"
+		r6flag="-6"
+	else
+		local_addr="0.0.0.0"
+	fi
+
+	ip -net "$listener_ns" $r6flag rule add fwmark 1 lookup 100
+	if [ $? -ne 0 ]; then
+		ip netns exec "$listener_ns" nft flush ruleset
+		echo "SKIP: $msg, ip $r6flag rule failed"
+		return
+	fi
+
+	ip -net "$listener_ns" route add local $local_addr/0 dev lo table 100
+	if [ $? -ne 0 ]; then
+		ip netns exec "$listener_ns" nft flush ruleset
+		ip -net "$listener_ns" $r6flag rule del fwmark 1 lookup 100
+		echo "SKIP: $msg, ip route add local $local_addr failed"
+		return
+	fi
+
+	echo "INFO: test $msg"
+
+	TEST_COUNT=10000
+	local extra_args="-o TRANSPARENT"
+	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP \
+		    ${connect_addr} ${local_addr} "${extra_args}"
+	lret=$?
+
+	ip netns exec "$listener_ns" nft flush ruleset
+	ip -net "$listener_ns" $r6flag rule del fwmark 1 lookup 100
+	ip -net "$listener_ns" route del local $local_addr/0 dev lo table 100
+
+	if [ $lret -ne 0 ]; then
+		echo "FAIL: $msg, mptcp connection error" 1>&2
+		ret=$lret
+		return 1
+	fi
+
+	echo "PASS: $msg"
+	return 0
+}
+
 run_tests_peekmode()
 {
 	local peekmode="$1"
@@ -794,5 +870,9 @@ run_tests_peekmode "saveWithPeek"
 run_tests_peekmode "saveAfterPeek"
 stop_if_error "Tests with peek mode have failed"
 
+# connect to ns4 ip address, ns2 should intercept/proxy
+run_test_transparent 10.0.3.1 "tproxy ipv4"
+run_test_transparent dead:beef:3::1 "tproxy ipv6"
+
 display_time
 exit $ret
-- 
cgit v1.2.3


From bdf1565fe03d29777d24e239163d0d53e4af9ce0 Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 19 Nov 2021 14:24:56 +0800
Subject: selftests/tc-testing: match any qdisc type

We should not always presume all kernels use pfifo_fast as the default qdisc.

For example, a fq_codel qdisk could have below output:
qdisc fq_codel 0: parent 1:4 limit 10240p flows 1024 quantum 1514 target 5ms interval 100ms memory_limit 32Mb ecn drop_batch 64

Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: Peilin Ye <peilin.ye@bytedance.com>
Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json
index 88a20c781e49..c6046096d9db 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/mq.json
@@ -15,7 +15,7 @@
 	    "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq",
 	    "expExitCode": "0",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-4]",
 	    "matchCount": "4",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
@@ -37,7 +37,7 @@
 	    "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq",
 	    "expExitCode": "0",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-9,a-f][0-9,a-f]{0,2} bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-9,a-f][0-9,a-f]{0,2}",
 	    "matchCount": "256",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
@@ -60,7 +60,7 @@
 	    "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq",
 	    "expExitCode": "2",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-4]",
 	    "matchCount": "4",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
@@ -82,7 +82,7 @@
 	    "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq",
 	    "expExitCode": "2",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-4]",
 	    "matchCount": "0",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
@@ -106,7 +106,7 @@
 	    "cmdUnderTest": "$TC qdisc del dev $ETH root handle 1: mq",
 	    "expExitCode": "2",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-4]",
 	    "matchCount": "0",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
@@ -128,7 +128,7 @@
 	    "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: mq",
 	    "expExitCode": "2",
 	    "verifyCmd": "$TC qdisc show dev $ETH",
-	    "matchPattern": "qdisc pfifo_fast 0: parent 1:[1-4] bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1",
+	    "matchPattern": "qdisc [a-zA-Z0-9_]+ 0: parent 1:[1-4]",
 	    "matchCount": "0",
 	    "teardown": [
 		    "echo \"1\" > /sys/bus/netdevsim/del_device"
-- 
cgit v1.2.3


From ac2944abe4d7732f29a79f063c9cae7df2a3e3cc Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 19 Nov 2021 14:24:57 +0800
Subject: selftests/tc-testings: Be compatible with newer tc output

old tc(iproute2-5.9.0) output:
 action order 1: bpf action.o:[action-ok] id 60 tag bcf7977d3b93787c jited default-action pipe
newer tc(iproute2-5.14.0) output:
 action order 1: bpf action.o:[action-ok] id 64 name tag bcf7977d3b93787c jited default-action pipe

It can fix below errors:
 # ok 260 f84a - Add cBPF action with invalid bytecode
 # not ok 261 e939 - Add eBPF action with valid object-file
 #       Could not match regex pattern. Verify command output:
 # total acts 0
 #
 #       action order 1: bpf action.o:[action-ok] id 42 name  tag bcf7977d3b93787c jited default-action pipe
 #        index 667 ref 1 bind 0

Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
index 503982b8f295..91832400ddbd 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
@@ -68,7 +68,7 @@
         "cmdUnderTest": "$TC action add action bpf object-file $EBPFDIR/action.o section action-ok index 667",
         "expExitCode": "0",
         "verifyCmd": "$TC action get action bpf index 667",
-        "matchPattern": "action order [0-9]*: bpf action.o:\\[action-ok\\] id [0-9]* tag [0-9a-f]{16}( jited)? default-action pipe.*index 667 ref",
+        "matchPattern": "action order [0-9]*: bpf action.o:\\[action-ok\\] id [0-9].* tag [0-9a-f]{16}( jited)? default-action pipe.*index 667 ref",
         "matchCount": "1",
         "teardown": [
             "$TC action flush action bpf"
-- 
cgit v1.2.3


From 02ebe49ab06111a887202959e4d58a3cf252b5ee Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Mon, 22 Nov 2021 17:15:14 +0200
Subject: selftests: net: fib_nexthops: add test for group refcount imbalance
 bug

The new selftest runs a sequence which causes circular refcount
dependency between deleted objects which cannot be released and results
in a netdevice refcount imbalance.

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_nexthops.sh | 63 +++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index b5a69ad191b0..d444ee6aa3cb 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -629,6 +629,66 @@ ipv6_fcnal()
 	log_test $? 0 "Nexthops removed on admin down"
 }
 
+ipv6_grp_refs()
+{
+	if [ ! -x "$(command -v mausezahn)" ]; then
+		echo "SKIP: Could not run test; need mausezahn tool"
+		return
+	fi
+
+	run_cmd "$IP link set dev veth1 up"
+	run_cmd "$IP link add veth1.10 link veth1 up type vlan id 10"
+	run_cmd "$IP link add veth1.20 link veth1 up type vlan id 20"
+	run_cmd "$IP -6 addr add 2001:db8:91::1/64 dev veth1.10"
+	run_cmd "$IP -6 addr add 2001:db8:92::1/64 dev veth1.20"
+	run_cmd "$IP -6 neigh add 2001:db8:91::2 lladdr 00:11:22:33:44:55 dev veth1.10"
+	run_cmd "$IP -6 neigh add 2001:db8:92::2 lladdr 00:11:22:33:44:55 dev veth1.20"
+	run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1.10"
+	run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth1.20"
+	run_cmd "$IP nexthop add id 102 group 100"
+	run_cmd "$IP route add 2001:db8:101::1/128 nhid 102"
+
+	# create per-cpu dsts through nh 100
+	run_cmd "ip netns exec me mausezahn -6 veth1.10 -B 2001:db8:101::1 -A 2001:db8:91::1 -c 5 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1"
+
+	# remove nh 100 from the group to delete the route potentially leaving
+	# a stale per-cpu dst which holds a reference to the nexthop's net
+	# device and to the IPv6 route
+	run_cmd "$IP nexthop replace id 102 group 101"
+	run_cmd "$IP route del 2001:db8:101::1/128"
+
+	# add both nexthops to the group so a reference is taken on them
+	run_cmd "$IP nexthop replace id 102 group 100/101"
+
+	# if the bug described in commit "net: nexthop: release IPv6 per-cpu
+	# dsts when replacing a nexthop group" exists at this point we have
+	# an unlinked IPv6 route (but not freed due to stale dst) with a
+	# reference over the group so we delete the group which will again
+	# only unlink it due to the route reference
+	run_cmd "$IP nexthop del id 102"
+
+	# delete the nexthop with stale dst, since we have an unlinked
+	# group with a ref to it and an unlinked IPv6 route with ref to the
+	# group, the nh will only be unlinked and not freed so the stale dst
+	# remains forever and we get a net device refcount imbalance
+	run_cmd "$IP nexthop del id 100"
+
+	# if a reference was lost this command will hang because the net device
+	# cannot be removed
+	timeout -s KILL 5 ip netns exec me ip link del veth1.10 >/dev/null 2>&1
+
+	# we can't cleanup if the command is hung trying to delete the netdev
+	if [ $? -eq 137 ]; then
+		return 1
+	fi
+
+	# cleanup
+	run_cmd "$IP link del veth1.20"
+	run_cmd "$IP nexthop flush"
+
+	return 0
+}
+
 ipv6_grp_fcnal()
 {
 	local rc
@@ -734,6 +794,9 @@ ipv6_grp_fcnal()
 
 	run_cmd "$IP nexthop add id 108 group 31/24"
 	log_test $? 2 "Nexthop group can not have a blackhole and another nexthop"
+
+	ipv6_grp_refs
+	log_test $? 0 "Nexthop group replace refcounts"
 }
 
 ipv6_res_grp_fcnal()
-- 
cgit v1.2.3


From fa721d4f0b91f525339996f4faef7bb072d70162 Mon Sep 17 00:00:00 2001
From: Drew Fustini <dfustini@baylibre.com>
Date: Sun, 21 Nov 2021 23:05:30 -0800
Subject: selftests/bpf: Fix trivial typo

Fix trivial typo in comment from 'oveflow' to 'overflow'.

Reported-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Drew Fustini <dfustini@baylibre.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211122070528.837806-1-dfustini@baylibre.com
---
 tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index d6272013a5a3..af47aeb211e7 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -756,7 +756,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d,
 	/* overflow bpf_sock_ops struct with final element nonzero/zero.
 	 * Regardless of the value of the final field, we don't have all the
 	 * data we need to display it, so we should trigger an overflow.
-	 * In other words oveflow checking should trump "is field zero?"
+	 * In other words overflow checking should trump "is field zero?"
 	 * checks because if we've overflowed, it shouldn't matter what the
 	 * field is - we can't trust its value so shouldn't display it.
 	 */
-- 
cgit v1.2.3


From 710d5835b7ae17c517f5aeaff853c300959c87ec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Nov 2021 10:48:10 -0800
Subject: tools: sync uapi/linux/if_link.h header

This file has not been updated for a while.

Sync it before BIG TCP patch series.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20211122184810.769159-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/include/uapi/linux/if_link.h | 293 ++++++++++++++++++++++++++++++++-----
 1 file changed, 259 insertions(+), 34 deletions(-)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index b3610fdd1fee..eebd3894fe89 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -7,24 +7,23 @@
 
 /* This struct should be in sync with struct rtnl_link_stats64 */
 struct rtnl_link_stats {
-	__u32	rx_packets;		/* total packets received	*/
-	__u32	tx_packets;		/* total packets transmitted	*/
-	__u32	rx_bytes;		/* total bytes received 	*/
-	__u32	tx_bytes;		/* total bytes transmitted	*/
-	__u32	rx_errors;		/* bad packets received		*/
-	__u32	tx_errors;		/* packet transmit problems	*/
-	__u32	rx_dropped;		/* no space in linux buffers	*/
-	__u32	tx_dropped;		/* no space available in linux	*/
-	__u32	multicast;		/* multicast packets received	*/
+	__u32	rx_packets;
+	__u32	tx_packets;
+	__u32	rx_bytes;
+	__u32	tx_bytes;
+	__u32	rx_errors;
+	__u32	tx_errors;
+	__u32	rx_dropped;
+	__u32	tx_dropped;
+	__u32	multicast;
 	__u32	collisions;
-
 	/* detailed rx_errors: */
 	__u32	rx_length_errors;
-	__u32	rx_over_errors;		/* receiver ring buff overflow	*/
-	__u32	rx_crc_errors;		/* recved pkt with crc error	*/
-	__u32	rx_frame_errors;	/* recv'd frame alignment error */
-	__u32	rx_fifo_errors;		/* recv'r fifo overrun		*/
-	__u32	rx_missed_errors;	/* receiver missed packet	*/
+	__u32	rx_over_errors;
+	__u32	rx_crc_errors;
+	__u32	rx_frame_errors;
+	__u32	rx_fifo_errors;
+	__u32	rx_missed_errors;
 
 	/* detailed tx_errors */
 	__u32	tx_aborted_errors;
@@ -37,29 +36,201 @@ struct rtnl_link_stats {
 	__u32	rx_compressed;
 	__u32	tx_compressed;
 
-	__u32	rx_nohandler;		/* dropped, no handler found	*/
+	__u32	rx_nohandler;
 };
 
-/* The main device statistics structure */
+/**
+ * struct rtnl_link_stats64 - The main device statistics structure.
+ *
+ * @rx_packets: Number of good packets received by the interface.
+ *   For hardware interfaces counts all good packets received from the device
+ *   by the host, including packets which host had to drop at various stages
+ *   of processing (even in the driver).
+ *
+ * @tx_packets: Number of packets successfully transmitted.
+ *   For hardware interfaces counts packets which host was able to successfully
+ *   hand over to the device, which does not necessarily mean that packets
+ *   had been successfully transmitted out of the device, only that device
+ *   acknowledged it copied them out of host memory.
+ *
+ * @rx_bytes: Number of good received bytes, corresponding to @rx_packets.
+ *
+ *   For IEEE 802.3 devices should count the length of Ethernet Frames
+ *   excluding the FCS.
+ *
+ * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets.
+ *
+ *   For IEEE 802.3 devices should count the length of Ethernet Frames
+ *   excluding the FCS.
+ *
+ * @rx_errors: Total number of bad packets received on this network device.
+ *   This counter must include events counted by @rx_length_errors,
+ *   @rx_crc_errors, @rx_frame_errors and other errors not otherwise
+ *   counted.
+ *
+ * @tx_errors: Total number of transmit problems.
+ *   This counter must include events counter by @tx_aborted_errors,
+ *   @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors,
+ *   @tx_window_errors and other errors not otherwise counted.
+ *
+ * @rx_dropped: Number of packets received but not processed,
+ *   e.g. due to lack of resources or unsupported protocol.
+ *   For hardware interfaces this counter may include packets discarded
+ *   due to L2 address filtering but should not include packets dropped
+ *   by the device due to buffer exhaustion which are counted separately in
+ *   @rx_missed_errors (since procfs folds those two counters together).
+ *
+ * @tx_dropped: Number of packets dropped on their way to transmission,
+ *   e.g. due to lack of resources.
+ *
+ * @multicast: Multicast packets received.
+ *   For hardware interfaces this statistic is commonly calculated
+ *   at the device level (unlike @rx_packets) and therefore may include
+ *   packets which did not reach the host.
+ *
+ *   For IEEE 802.3 devices this counter may be equivalent to:
+ *
+ *    - 30.3.1.1.21 aMulticastFramesReceivedOK
+ *
+ * @collisions: Number of collisions during packet transmissions.
+ *
+ * @rx_length_errors: Number of packets dropped due to invalid length.
+ *   Part of aggregate "frame" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices this counter should be equivalent to a sum
+ *   of the following attributes:
+ *
+ *    - 30.3.1.1.23 aInRangeLengthErrors
+ *    - 30.3.1.1.24 aOutOfRangeLengthField
+ *    - 30.3.1.1.25 aFrameTooLongErrors
+ *
+ * @rx_over_errors: Receiver FIFO overflow event counter.
+ *
+ *   Historically the count of overflow events. Such events may be
+ *   reported in the receive descriptors or via interrupts, and may
+ *   not correspond one-to-one with dropped packets.
+ *
+ *   The recommended interpretation for high speed interfaces is -
+ *   number of packets dropped because they did not fit into buffers
+ *   provided by the host, e.g. packets larger than MTU or next buffer
+ *   in the ring was not available for a scatter transfer.
+ *
+ *   Part of aggregate "frame" errors in `/proc/net/dev`.
+ *
+ *   This statistics was historically used interchangeably with
+ *   @rx_fifo_errors.
+ *
+ *   This statistic corresponds to hardware events and is not commonly used
+ *   on software devices.
+ *
+ * @rx_crc_errors: Number of packets received with a CRC error.
+ *   Part of aggregate "frame" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices this counter must be equivalent to:
+ *
+ *    - 30.3.1.1.6 aFrameCheckSequenceErrors
+ *
+ * @rx_frame_errors: Receiver frame alignment errors.
+ *   Part of aggregate "frame" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices this counter should be equivalent to:
+ *
+ *    - 30.3.1.1.7 aAlignmentErrors
+ *
+ * @rx_fifo_errors: Receiver FIFO error counter.
+ *
+ *   Historically the count of overflow events. Those events may be
+ *   reported in the receive descriptors or via interrupts, and may
+ *   not correspond one-to-one with dropped packets.
+ *
+ *   This statistics was used interchangeably with @rx_over_errors.
+ *   Not recommended for use in drivers for high speed interfaces.
+ *
+ *   This statistic is used on software devices, e.g. to count software
+ *   packet queue overflow (can) or sequencing errors (GRE).
+ *
+ * @rx_missed_errors: Count of packets missed by the host.
+ *   Folded into the "drop" counter in `/proc/net/dev`.
+ *
+ *   Counts number of packets dropped by the device due to lack
+ *   of buffer space. This usually indicates that the host interface
+ *   is slower than the network interface, or host is not keeping up
+ *   with the receive packet rate.
+ *
+ *   This statistic corresponds to hardware events and is not used
+ *   on software devices.
+ *
+ * @tx_aborted_errors:
+ *   Part of aggregate "carrier" errors in `/proc/net/dev`.
+ *   For IEEE 802.3 devices capable of half-duplex operation this counter
+ *   must be equivalent to:
+ *
+ *    - 30.3.1.1.11 aFramesAbortedDueToXSColls
+ *
+ *   High speed interfaces may use this counter as a general device
+ *   discard counter.
+ *
+ * @tx_carrier_errors: Number of frame transmission errors due to loss
+ *   of carrier during transmission.
+ *   Part of aggregate "carrier" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices this counter must be equivalent to:
+ *
+ *    - 30.3.1.1.13 aCarrierSenseErrors
+ *
+ * @tx_fifo_errors: Number of frame transmission errors due to device
+ *   FIFO underrun / underflow. This condition occurs when the device
+ *   begins transmission of a frame but is unable to deliver the
+ *   entire frame to the transmitter in time for transmission.
+ *   Part of aggregate "carrier" errors in `/proc/net/dev`.
+ *
+ * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for
+ *   old half-duplex Ethernet.
+ *   Part of aggregate "carrier" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices possibly equivalent to:
+ *
+ *    - 30.3.2.1.4 aSQETestErrors
+ *
+ * @tx_window_errors: Number of frame transmission errors due
+ *   to late collisions (for Ethernet - after the first 64B of transmission).
+ *   Part of aggregate "carrier" errors in `/proc/net/dev`.
+ *
+ *   For IEEE 802.3 devices this counter must be equivalent to:
+ *
+ *    - 30.3.1.1.10 aLateCollisions
+ *
+ * @rx_compressed: Number of correctly received compressed packets.
+ *   This counters is only meaningful for interfaces which support
+ *   packet compression (e.g. CSLIP, PPP).
+ *
+ * @tx_compressed: Number of transmitted compressed packets.
+ *   This counters is only meaningful for interfaces which support
+ *   packet compression (e.g. CSLIP, PPP).
+ *
+ * @rx_nohandler: Number of packets received on the interface
+ *   but dropped by the networking stack because the device is
+ *   not designated to receive packets (e.g. backup link in a bond).
+ */
 struct rtnl_link_stats64 {
-	__u64	rx_packets;		/* total packets received	*/
-	__u64	tx_packets;		/* total packets transmitted	*/
-	__u64	rx_bytes;		/* total bytes received 	*/
-	__u64	tx_bytes;		/* total bytes transmitted	*/
-	__u64	rx_errors;		/* bad packets received		*/
-	__u64	tx_errors;		/* packet transmit problems	*/
-	__u64	rx_dropped;		/* no space in linux buffers	*/
-	__u64	tx_dropped;		/* no space available in linux	*/
-	__u64	multicast;		/* multicast packets received	*/
+	__u64	rx_packets;
+	__u64	tx_packets;
+	__u64	rx_bytes;
+	__u64	tx_bytes;
+	__u64	rx_errors;
+	__u64	tx_errors;
+	__u64	rx_dropped;
+	__u64	tx_dropped;
+	__u64	multicast;
 	__u64	collisions;
 
 	/* detailed rx_errors: */
 	__u64	rx_length_errors;
-	__u64	rx_over_errors;		/* receiver ring buff overflow	*/
-	__u64	rx_crc_errors;		/* recved pkt with crc error	*/
-	__u64	rx_frame_errors;	/* recv'd frame alignment error */
-	__u64	rx_fifo_errors;		/* recv'r fifo overrun		*/
-	__u64	rx_missed_errors;	/* receiver missed packet	*/
+	__u64	rx_over_errors;
+	__u64	rx_crc_errors;
+	__u64	rx_frame_errors;
+	__u64	rx_fifo_errors;
+	__u64	rx_missed_errors;
 
 	/* detailed tx_errors */
 	__u64	tx_aborted_errors;
@@ -71,8 +242,7 @@ struct rtnl_link_stats64 {
 	/* for cslip etc */
 	__u64	rx_compressed;
 	__u64	tx_compressed;
-
-	__u64	rx_nohandler;		/* dropped, no handler found	*/
+	__u64	rx_nohandler;
 };
 
 /* The struct should be in sync with struct ifmap */
@@ -170,12 +340,29 @@ enum {
 	IFLA_PROP_LIST,
 	IFLA_ALT_IFNAME, /* Alternative ifname */
 	IFLA_PERM_ADDRESS,
+	IFLA_PROTO_DOWN_REASON,
+
+	/* device (sysfs) name as parent, used instead
+	 * of IFLA_LINK where there's no parent netdev
+	 */
+	IFLA_PARENT_DEV_NAME,
+	IFLA_PARENT_DEV_BUS_NAME,
+
 	__IFLA_MAX
 };
 
 
 #define IFLA_MAX (__IFLA_MAX - 1)
 
+enum {
+	IFLA_PROTO_DOWN_REASON_UNSPEC,
+	IFLA_PROTO_DOWN_REASON_MASK,	/* u32, mask for reason bits */
+	IFLA_PROTO_DOWN_REASON_VALUE,   /* u32, reason bit value */
+
+	__IFLA_PROTO_DOWN_REASON_CNT,
+	IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1
+};
+
 /* backwards compatibility for userspace */
 #ifndef __KERNEL__
 #define IFLA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg))))
@@ -293,6 +480,7 @@ enum {
 	IFLA_BR_MCAST_MLD_VERSION,
 	IFLA_BR_VLAN_STATS_PER_PORT,
 	IFLA_BR_MULTI_BOOLOPT,
+	IFLA_BR_MCAST_QUERIER_STATE,
 	__IFLA_BR_MAX,
 };
 
@@ -346,6 +534,8 @@ enum {
 	IFLA_BRPORT_BACKUP_PORT,
 	IFLA_BRPORT_MRP_RING_OPEN,
 	IFLA_BRPORT_MRP_IN_OPEN,
+	IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
+	IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
@@ -433,6 +623,7 @@ enum macvlan_macaddr_mode {
 };
 
 #define MACVLAN_FLAG_NOPROMISC	1
+#define MACVLAN_FLAG_NODST	2 /* skip dst macvlan if matching src macvlan */
 
 /* VRF section */
 enum {
@@ -597,6 +788,18 @@ enum ifla_geneve_df {
 	GENEVE_DF_MAX = __GENEVE_DF_END - 1,
 };
 
+/* Bareudp section  */
+enum {
+	IFLA_BAREUDP_UNSPEC,
+	IFLA_BAREUDP_PORT,
+	IFLA_BAREUDP_ETHERTYPE,
+	IFLA_BAREUDP_SRCPORT_MIN,
+	IFLA_BAREUDP_MULTIPROTO_MODE,
+	__IFLA_BAREUDP_MAX
+};
+
+#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1)
+
 /* PPP section */
 enum {
 	IFLA_PPP_UNSPEC,
@@ -899,7 +1102,14 @@ enum {
 #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1)
 
 
-/* HSR section */
+/* HSR/PRP section, both uses same interface */
+
+/* Different redundancy protocols for hsr device */
+enum {
+	HSR_PROTOCOL_HSR,
+	HSR_PROTOCOL_PRP,
+	HSR_PROTOCOL_MAX,
+};
 
 enum {
 	IFLA_HSR_UNSPEC,
@@ -909,6 +1119,9 @@ enum {
 	IFLA_HSR_SUPERVISION_ADDR,	/* Supervision frame multicast addr */
 	IFLA_HSR_SEQ_NR,
 	IFLA_HSR_VERSION,		/* HSR version */
+	IFLA_HSR_PROTOCOL,		/* Indicate different protocol than
+					 * HSR. For example PRP.
+					 */
 	__IFLA_HSR_MAX,
 };
 
@@ -1033,6 +1246,8 @@ enum {
 #define RMNET_FLAGS_INGRESS_MAP_COMMANDS          (1U << 1)
 #define RMNET_FLAGS_INGRESS_MAP_CKSUMV4           (1U << 2)
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
+#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
+#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5            (1U << 5)
 
 enum {
 	IFLA_RMNET_UNSPEC,
@@ -1048,4 +1263,14 @@ struct ifla_rmnet_flags {
 	__u32	mask;
 };
 
+/* MCTP section */
+
+enum {
+	IFLA_MCTP_UNSPEC,
+	IFLA_MCTP_NET,
+	__IFLA_MCTP_MAX,
+};
+
+#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
-- 
cgit v1.2.3


From 619ca0d0108a5866cca7793bd7e19c5d7a30e750 Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Mon, 22 Nov 2021 09:18:06 -0800
Subject: selftests: add arp_ndisc_evict_nocarrier to Makefile

This was previously added in selftests but never added
to the Makefile

Signed-off-by: James Prestwood <prestwoj@gmail.com>
Link: https://lore.kernel.org/r/20211122171806.3529401-1-prestwoj@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 7615f29831eb..9897fa9ab953 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -34,6 +34,7 @@ TEST_PROGS += srv6_end_dt46_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt4_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
 TEST_PROGS += vrf_strict_mode_test.sh
+TEST_PROGS += arp_ndisc_evict_nocarrier.sh
 TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
 TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
 TEST_GEN_FILES =  socket nettest
-- 
cgit v1.2.3


From e7049395b1c3085d12b5ba16d058c65598368853 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Mon, 22 Nov 2021 19:16:21 +0900
Subject: dccp/tcp: Remove an unused argument in inet_csk_listen_start().

The commit 1295e2cf3065 ("inet: minor optimization for backlog setting in
listen(2)") added change so that sk_max_ack_backlog is initialised earlier
in inet_dccp_listen() and inet_listen().  Since then, we no longer use
backlog in inet_csk_listen_start(), so let's remove it.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Richard Sailer <richard_siegfried@systemli.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h                          | 2 +-
 net/dccp/proto.c                                            | 6 +++---
 net/ipv4/af_inet.c                                          | 2 +-
 net/ipv4/inet_connection_sock.c                             | 2 +-
 tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index fa6a87246a7b..4ad47d9f9d27 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -304,7 +304,7 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk)
 			(EPOLLIN | EPOLLRDNORM) : 0;
 }
 
-int inet_csk_listen_start(struct sock *sk, int backlog);
+int inet_csk_listen_start(struct sock *sk);
 void inet_csk_listen_stop(struct sock *sk);
 
 void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index fc44dadc778b..55b8f958cdd2 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -238,7 +238,7 @@ void dccp_destroy_sock(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
 
-static inline int dccp_listen_start(struct sock *sk, int backlog)
+static inline int dccp_listen_start(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
@@ -246,7 +246,7 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
 	/* do not start to listen if feature negotiation setup fails */
 	if (dccp_feat_finalise_settings(dp))
 		return -EPROTO;
-	return inet_csk_listen_start(sk, backlog);
+	return inet_csk_listen_start(sk);
 }
 
 static inline int dccp_need_reset(int state)
@@ -935,7 +935,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
 		 * FIXME: here it probably should be sk->sk_prot->listen_start
 		 * see tcp_listen_start
 		 */
-		err = dccp_listen_start(sk, backlog);
+		err = dccp_listen_start(sk);
 		if (err)
 			goto out;
 	}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c66b0563a267..aa8d15b1e2a4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -225,7 +225,7 @@ int inet_listen(struct socket *sock, int backlog)
 			tcp_fastopen_init_key_once(sock_net(sk));
 		}
 
-		err = inet_csk_listen_start(sk, backlog);
+		err = inet_csk_listen_start(sk);
 		if (err)
 			goto out;
 		tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f7fea3a7c5e6..23da67a3fc06 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1035,7 +1035,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_csk_prepare_forced_close);
 
-int inet_csk_listen_start(struct sock *sk, int backlog)
+int inet_csk_listen_start(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c
index 8e94e5c080aa..6dc1f28fc4b6 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c
@@ -68,7 +68,7 @@ static void set_task_info(struct sock *sk)
 }
 
 SEC("fentry/inet_csk_listen_start")
-int BPF_PROG(trace_inet_csk_listen_start, struct sock *sk, int backlog)
+int BPF_PROG(trace_inet_csk_listen_start, struct sock *sk)
 {
 	set_task_info(sk);
 
-- 
cgit v1.2.3


From d7fbdc575b33c374ce88ccfe3ab364f7eb240f8a Mon Sep 17 00:00:00 2001
From: ran jianping <ran.jianping@zte.com.cn>
Date: Fri, 12 Nov 2021 09:09:46 +0000
Subject: thermal: tools: tmon: remove unneeded local variable

Fix the following coccicheck review:
/tools/thermal/tmon/pid.c:57:5-8: Unneeded variable

Remove unneeded variable used to store return value.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ran jianping <ran.jianping@zte.com.cn>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/thermal/tmon/pid.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/thermal/tmon/pid.c b/tools/thermal/tmon/pid.c
index c54edb4f630c..296f69c00c57 100644
--- a/tools/thermal/tmon/pid.c
+++ b/tools/thermal/tmon/pid.c
@@ -54,7 +54,6 @@ static double xk_1, xk_2; /* input temperature x[k-#] */
  */
 int init_thermal_controller(void)
 {
-	int ret = 0;
 
 	/* init pid params */
 	p_param.ts = ticktime;
@@ -65,7 +64,7 @@ int init_thermal_controller(void)
 
 	p_param.t_target = target_temp_user;
 
-	return ret;
+	return 0;
 }
 
 void controller_reset(void)
-- 
cgit v1.2.3


From 83762cb5c7c464af4cbaba5679af31c7fe534979 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 15 Nov 2021 13:20:57 -0800
Subject: dax: Kill DEV_DAX_PMEM_COMPAT

The /sys/class/dax compatibility option has shipped in the kernel for 4
years now which should be sufficient time for tools to abandon the old
ABI in favor of the /sys/bus/dax device-model. Delete it now and see if
anyone screams.

Since this compatibility option shipped there has been more reports of
users being surprised by the compat ABI than surprised by the "new", so
the compat infrastructure has outlived its usefulness. Recall that
/sys/bus/dax device-model is required for the dax kmem driver which
allows PMEM to be used as "System RAM".

The following projects were known to have a dependency on /sys/class/dax
and have dropped their dependency as of the listed version:

- ndctl (including libndctl, daxctl, and libdaxctl): v64+
- fio: v3.13+
- pmdk: v1.5.2+

As further evidence this option is no longer needed some distributions
have already stopped enabling CONFIG_DEV_DAX_PMEM_COMPAT.

Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Reported-by: Vishal Verma <vishal.l.verma@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Link: https://lore.kernel.org/r/163701116195.3784476.726128179293466337.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/obsolete/sysfs-class-dax  |  22 ------
 drivers/dax/Kconfig                         |   9 ---
 drivers/dax/Makefile                        |   3 +-
 drivers/dax/bus.c                           |  21 +-----
 drivers/dax/bus.h                           |  13 ----
 drivers/dax/device.c                        |   6 +-
 drivers/dax/pmem.c                          | 103 ++++++++++++++++++++++++++++
 drivers/dax/pmem/Makefile                   |   1 -
 drivers/dax/pmem/compat.c                   |  72 -------------------
 drivers/dax/pmem/core.c                     |  79 ---------------------
 drivers/dax/pmem/pmem.c                     |  30 --------
 tools/testing/nvdimm/Kbuild                 |   8 +--
 tools/testing/nvdimm/dax_pmem_compat_test.c |   8 ---
 tools/testing/nvdimm/dax_pmem_core_test.c   |   8 ---
 tools/testing/nvdimm/test/ndtest.c          |   4 --
 tools/testing/nvdimm/test/nfit.c            |   4 --
 16 files changed, 109 insertions(+), 282 deletions(-)
 delete mode 100644 Documentation/ABI/obsolete/sysfs-class-dax
 create mode 100644 drivers/dax/pmem.c
 delete mode 100644 drivers/dax/pmem/compat.c
 delete mode 100644 drivers/dax/pmem/core.c
 delete mode 100644 tools/testing/nvdimm/dax_pmem_compat_test.c
 delete mode 100644 tools/testing/nvdimm/dax_pmem_core_test.c

(limited to 'tools')

diff --git a/Documentation/ABI/obsolete/sysfs-class-dax b/Documentation/ABI/obsolete/sysfs-class-dax
deleted file mode 100644
index 5bcce27458e3..000000000000
--- a/Documentation/ABI/obsolete/sysfs-class-dax
+++ /dev/null
@@ -1,22 +0,0 @@
-What:           /sys/class/dax/
-Date:           May, 2016
-KernelVersion:  v4.7
-Contact:        nvdimm@lists.linux.dev
-Description:	Device DAX is the device-centric analogue of Filesystem
-		DAX (CONFIG_FS_DAX).  It allows memory ranges to be
-		allocated and mapped without need of an intervening file
-		system.  Device DAX is strict, precise and predictable.
-		Specifically this interface:
-
-		1. Guarantees fault granularity with respect to a given
-		   page size (pte, pmd, or pud) set at configuration time.
-
-		2. Enforces deterministic behavior by being strict about
-		   what fault scenarios are supported.
-
-		The /sys/class/dax/ interface enumerates all the
-		device-dax instances in the system. The ABI is
-		deprecated and will be removed after 2020. It is
-		replaced with the DAX bus interface /sys/bus/dax/ where
-		device-dax instances can be found under
-		/sys/bus/dax/devices/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index d2834c2cfa10..15b442ef7b93 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -70,13 +70,4 @@ config DEV_DAX_KMEM
 
 	  Say N if unsure.
 
-config DEV_DAX_PMEM_COMPAT
-	tristate "PMEM DAX: support the deprecated /sys/class/dax interface"
-	depends on m && DEV_DAX_PMEM=m
-	default DEV_DAX_PMEM
-	help
-	  Older versions of the libdaxctl library expect to find all
-	  device-dax instances under /sys/class/dax. If libdaxctl in
-	  your distribution is older than v58 say M, otherwise say N.
-
 endif
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 9d4ba672d305..90a56ca3b345 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -2,10 +2,11 @@
 obj-$(CONFIG_DAX) += dax.o
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 dax-y := super.o
 dax-y += bus.o
 device_dax-y := device.o
+dax_pmem-y := pmem.o
 
-obj-y += pmem/
 obj-y += hmem/
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 6cc4da4c713d..452cf7860926 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -10,8 +10,6 @@
 #include "dax-private.h"
 #include "bus.h"
 
-static struct class *dax_class;
-
 static DEFINE_MUTEX(dax_bus_lock);
 
 #define DAX_NAME_LEN 30
@@ -1343,10 +1341,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 
 	inode = dax_inode(dax_dev);
 	dev->devt = inode->i_rdev;
-	if (data->subsys == DEV_DAX_BUS)
-		dev->bus = &dax_bus_type;
-	else
-		dev->class = dax_class;
+	dev->bus = &dax_bus_type;
 	dev->parent = parent;
 	dev->type = &dev_dax_type;
 
@@ -1445,22 +1440,10 @@ EXPORT_SYMBOL_GPL(dax_driver_unregister);
 
 int __init dax_bus_init(void)
 {
-	int rc;
-
-	if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) {
-		dax_class = class_create(THIS_MODULE, "dax");
-		if (IS_ERR(dax_class))
-			return PTR_ERR(dax_class);
-	}
-
-	rc = bus_register(&dax_bus_type);
-	if (rc)
-		class_destroy(dax_class);
-	return rc;
+	return bus_register(&dax_bus_type);
 }
 
 void __exit dax_bus_exit(void)
 {
 	bus_unregister(&dax_bus_type);
-	class_destroy(dax_class);
 }
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index 1e946ad7780a..381cec9ff05c 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -16,24 +16,15 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 		struct range *range, int target_node, unsigned int align,
 		unsigned long flags);
 
-enum dev_dax_subsys {
-	DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */
-	DEV_DAX_CLASS,
-};
-
 struct dev_dax_data {
 	struct dax_region *dax_region;
 	struct dev_pagemap *pgmap;
-	enum dev_dax_subsys subsys;
 	resource_size_t size;
 	int id;
 };
 
 struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
 
-/* to be deleted when DEV_DAX_CLASS is removed */
-struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys);
-
 struct dax_device_driver {
 	struct device_driver drv;
 	struct list_head ids;
@@ -49,10 +40,6 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
 void dax_driver_unregister(struct dax_device_driver *dax_drv);
 void kill_dev_dax(struct dev_dax *dev_dax);
 
-#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)
-int dev_dax_probe(struct dev_dax *dev_dax);
-#endif
-
 /*
  * While run_dax() is potentially a generic operation that could be
  * defined in include/linux/dax.h we don't want to grow any users
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index dd8222a42808..e58d597f0415 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -433,11 +433,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 	inode = dax_inode(dax_dev);
 	cdev = inode->i_cdev;
 	cdev_init(cdev, &dax_fops);
-	if (dev->class) {
-		/* for the CONFIG_DEV_DAX_PMEM_COMPAT case */
-		cdev->owner = dev->parent->driver->owner;
-	} else
-		cdev->owner = dev->driver->owner;
+	cdev->owner = dev->driver->owner;
 	cdev_set_parent(cdev, &dev->kobj);
 	rc = cdev_add(cdev, dev->devt, 1);
 	if (rc)
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
new file mode 100644
index 000000000000..f050ea78bb83
--- /dev/null
+++ b/drivers/dax/pmem.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */
+#include <linux/memremap.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include "../nvdimm/pfn.h"
+#include "../nvdimm/nd.h"
+#include "bus.h"
+
+static struct dev_dax *__dax_pmem_probe(struct device *dev)
+{
+	struct range range;
+	int rc, id, region_id;
+	resource_size_t offset;
+	struct nd_pfn_sb *pfn_sb;
+	struct dev_dax *dev_dax;
+	struct dev_dax_data data;
+	struct nd_namespace_io *nsio;
+	struct dax_region *dax_region;
+	struct dev_pagemap pgmap = { };
+	struct nd_namespace_common *ndns;
+	struct nd_dax *nd_dax = to_nd_dax(dev);
+	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+
+	ndns = nvdimm_namespace_common_probe(dev);
+	if (IS_ERR(ndns))
+		return ERR_CAST(ndns);
+
+	/* parse the 'pfn' info block via ->rw_bytes */
+	rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
+	if (rc)
+		return ERR_PTR(rc);
+	rc = nvdimm_setup_pfn(nd_pfn, &pgmap);
+	if (rc)
+		return ERR_PTR(rc);
+	devm_namespace_disable(dev, ndns);
+
+	/* reserve the metadata area, device-dax will reserve the data */
+	pfn_sb = nd_pfn->pfn_sb;
+	offset = le64_to_cpu(pfn_sb->dataoff);
+	nsio = to_nd_namespace_io(&ndns->dev);
+	if (!devm_request_mem_region(dev, nsio->res.start, offset,
+				dev_name(&ndns->dev))) {
+		dev_warn(dev, "could not reserve metadata\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", &region_id, &id);
+	if (rc != 2)
+		return ERR_PTR(-EINVAL);
+
+	/* adjust the dax_region range to the start of data */
+	range = pgmap.range;
+	range.start += offset;
+	dax_region = alloc_dax_region(dev, region_id, &range,
+			nd_region->target_node, le32_to_cpu(pfn_sb->align),
+			IORESOURCE_DAX_STATIC);
+	if (!dax_region)
+		return ERR_PTR(-ENOMEM);
+
+	data = (struct dev_dax_data) {
+		.dax_region = dax_region,
+		.id = id,
+		.pgmap = &pgmap,
+		.size = range_len(&range),
+	};
+	dev_dax = devm_create_dev_dax(&data);
+
+	/* child dev_dax instances now own the lifetime of the dax_region */
+	dax_region_put(dax_region);
+
+	return dev_dax;
+}
+
+static int dax_pmem_probe(struct device *dev)
+{
+	return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev));
+}
+
+static struct nd_device_driver dax_pmem_driver = {
+	.probe = dax_pmem_probe,
+	.drv = {
+		.name = "dax_pmem",
+	},
+	.type = ND_DRIVER_DAX_PMEM,
+};
+
+static int __init dax_pmem_init(void)
+{
+	return nd_driver_register(&dax_pmem_driver);
+}
+module_init(dax_pmem_init);
+
+static void __exit dax_pmem_exit(void)
+{
+	driver_unregister(&dax_pmem_driver.drv);
+}
+module_exit(dax_pmem_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
diff --git a/drivers/dax/pmem/Makefile b/drivers/dax/pmem/Makefile
index 010269f61d41..191c31f0d4f0 100644
--- a/drivers/dax/pmem/Makefile
+++ b/drivers/dax/pmem/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o
-obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o
 
 dax_pmem-y := pmem.o
 dax_pmem_core-y := core.o
diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c
deleted file mode 100644
index d81dc35fd65d..000000000000
--- a/drivers/dax/pmem/compat.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */
-#include <linux/percpu-refcount.h>
-#include <linux/memremap.h>
-#include <linux/module.h>
-#include <linux/pfn_t.h>
-#include <linux/nd.h>
-#include "../bus.h"
-
-/* we need the private definitions to implement compat suport */
-#include "../dax-private.h"
-
-static int dax_pmem_compat_probe(struct device *dev)
-{
-	struct dev_dax *dev_dax = __dax_pmem_probe(dev, DEV_DAX_CLASS);
-	int rc;
-
-	if (IS_ERR(dev_dax))
-		return PTR_ERR(dev_dax);
-
-        if (!devres_open_group(&dev_dax->dev, dev_dax, GFP_KERNEL))
-		return -ENOMEM;
-
-	device_lock(&dev_dax->dev);
-	rc = dev_dax_probe(dev_dax);
-	device_unlock(&dev_dax->dev);
-
-	devres_close_group(&dev_dax->dev, dev_dax);
-	if (rc)
-		devres_release_group(&dev_dax->dev, dev_dax);
-
-	return rc;
-}
-
-static int dax_pmem_compat_release(struct device *dev, void *data)
-{
-	device_lock(dev);
-	devres_release_group(dev, to_dev_dax(dev));
-	device_unlock(dev);
-
-	return 0;
-}
-
-static void dax_pmem_compat_remove(struct device *dev)
-{
-	device_for_each_child(dev, NULL, dax_pmem_compat_release);
-}
-
-static struct nd_device_driver dax_pmem_compat_driver = {
-	.probe = dax_pmem_compat_probe,
-	.remove = dax_pmem_compat_remove,
-	.drv = {
-		.name = "dax_pmem_compat",
-	},
-	.type = ND_DRIVER_DAX_PMEM,
-};
-
-static int __init dax_pmem_compat_init(void)
-{
-	return nd_driver_register(&dax_pmem_compat_driver);
-}
-module_init(dax_pmem_compat_init);
-
-static void __exit dax_pmem_compat_exit(void)
-{
-	driver_unregister(&dax_pmem_compat_driver.drv);
-}
-module_exit(dax_pmem_compat_exit);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Intel Corporation");
-MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c
deleted file mode 100644
index 062e8bc14223..000000000000
--- a/drivers/dax/pmem/core.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */
-#include <linux/memremap.h>
-#include <linux/module.h>
-#include <linux/pfn_t.h>
-#include "../../nvdimm/pfn.h"
-#include "../../nvdimm/nd.h"
-#include "../bus.h"
-
-struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
-{
-	struct range range;
-	int rc, id, region_id;
-	resource_size_t offset;
-	struct nd_pfn_sb *pfn_sb;
-	struct dev_dax *dev_dax;
-	struct dev_dax_data data;
-	struct nd_namespace_io *nsio;
-	struct dax_region *dax_region;
-	struct dev_pagemap pgmap = { };
-	struct nd_namespace_common *ndns;
-	struct nd_dax *nd_dax = to_nd_dax(dev);
-	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
-	struct nd_region *nd_region = to_nd_region(dev->parent);
-
-	ndns = nvdimm_namespace_common_probe(dev);
-	if (IS_ERR(ndns))
-		return ERR_CAST(ndns);
-
-	/* parse the 'pfn' info block via ->rw_bytes */
-	rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve());
-	if (rc)
-		return ERR_PTR(rc);
-	rc = nvdimm_setup_pfn(nd_pfn, &pgmap);
-	if (rc)
-		return ERR_PTR(rc);
-	devm_namespace_disable(dev, ndns);
-
-	/* reserve the metadata area, device-dax will reserve the data */
-	pfn_sb = nd_pfn->pfn_sb;
-	offset = le64_to_cpu(pfn_sb->dataoff);
-	nsio = to_nd_namespace_io(&ndns->dev);
-	if (!devm_request_mem_region(dev, nsio->res.start, offset,
-				dev_name(&ndns->dev))) {
-		dev_warn(dev, "could not reserve metadata\n");
-		return ERR_PTR(-EBUSY);
-	}
-
-	rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", &region_id, &id);
-	if (rc != 2)
-		return ERR_PTR(-EINVAL);
-
-	/* adjust the dax_region range to the start of data */
-	range = pgmap.range;
-	range.start += offset;
-	dax_region = alloc_dax_region(dev, region_id, &range,
-			nd_region->target_node, le32_to_cpu(pfn_sb->align),
-			IORESOURCE_DAX_STATIC);
-	if (!dax_region)
-		return ERR_PTR(-ENOMEM);
-
-	data = (struct dev_dax_data) {
-		.dax_region = dax_region,
-		.id = id,
-		.pgmap = &pgmap,
-		.subsys = subsys,
-		.size = range_len(&range),
-	};
-	dev_dax = devm_create_dev_dax(&data);
-
-	/* child dev_dax instances now own the lifetime of the dax_region */
-	dax_region_put(dax_region);
-
-	return dev_dax;
-}
-EXPORT_SYMBOL_GPL(__dax_pmem_probe);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/dax/pmem/pmem.c b/drivers/dax/pmem/pmem.c
index 0ae4238a0ef8..dfe91a2990fe 100644
--- a/drivers/dax/pmem/pmem.c
+++ b/drivers/dax/pmem/pmem.c
@@ -7,34 +7,4 @@
 #include <linux/nd.h>
 #include "../bus.h"
 
-static int dax_pmem_probe(struct device *dev)
-{
-	return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev, DEV_DAX_BUS));
-}
 
-static struct nd_device_driver dax_pmem_driver = {
-	.probe = dax_pmem_probe,
-	.drv = {
-		.name = "dax_pmem",
-	},
-	.type = ND_DRIVER_DAX_PMEM,
-};
-
-static int __init dax_pmem_init(void)
-{
-	return nd_driver_register(&dax_pmem_driver);
-}
-module_init(dax_pmem_init);
-
-static void __exit dax_pmem_exit(void)
-{
-	driver_unregister(&dax_pmem_driver.drv);
-}
-module_exit(dax_pmem_exit);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Intel Corporation");
-#if !IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)
-/* For compat builds, don't load this module by default */
-MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
-#endif
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 47f9cc9dcd94..c57d9e9d4480 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -35,8 +35,6 @@ obj-$(CONFIG_DAX) += dax.o
 endif
 obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
-obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o
-obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o
 
 nfit-y := $(ACPI_SRC)/core.o
 nfit-y += $(ACPI_SRC)/intel.o
@@ -67,12 +65,8 @@ device_dax-y += dax-dev.o
 device_dax-y += device_dax_test.o
 device_dax-y += config_check.o
 
-dax_pmem-y := $(DAX_SRC)/pmem/pmem.o
+dax_pmem-y := $(DAX_SRC)/pmem.o
 dax_pmem-y += dax_pmem_test.o
-dax_pmem_core-y := $(DAX_SRC)/pmem/core.o
-dax_pmem_core-y += dax_pmem_core_test.o
-dax_pmem_compat-y := $(DAX_SRC)/pmem/compat.o
-dax_pmem_compat-y += dax_pmem_compat_test.o
 dax_pmem-y += config_check.o
 
 libnvdimm-y := $(NVDIMM_SRC)/core.o
diff --git a/tools/testing/nvdimm/dax_pmem_compat_test.c b/tools/testing/nvdimm/dax_pmem_compat_test.c
deleted file mode 100644
index 7cd1877f3765..000000000000
--- a/tools/testing/nvdimm/dax_pmem_compat_test.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright(c) 2019 Intel Corporation. All rights reserved.
-
-#include <linux/module.h>
-#include <linux/printk.h>
-#include "watermark.h"
-
-nfit_test_watermark(dax_pmem_compat);
diff --git a/tools/testing/nvdimm/dax_pmem_core_test.c b/tools/testing/nvdimm/dax_pmem_core_test.c
deleted file mode 100644
index a4249cdbeec1..000000000000
--- a/tools/testing/nvdimm/dax_pmem_core_test.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright(c) 2019 Intel Corporation. All rights reserved.
-
-#include <linux/module.h>
-#include <linux/printk.h>
-#include "watermark.h"
-
-nfit_test_watermark(dax_pmem_core);
diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index 6862915f1fb0..3ca7c32e9362 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -1054,10 +1054,6 @@ static __init int ndtest_init(void)
 	libnvdimm_test();
 	device_dax_test();
 	dax_pmem_test();
-	dax_pmem_core_test();
-#ifdef CONFIG_DEV_DAX_PMEM_COMPAT
-	dax_pmem_compat_test();
-#endif
 
 	nfit_test_setup(ndtest_resource_lookup, NULL);
 
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index b1bff5fb0f65..0bc91ffee257 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3300,10 +3300,6 @@ static __init int nfit_test_init(void)
 	acpi_nfit_test();
 	device_dax_test();
 	dax_pmem_test();
-	dax_pmem_core_test();
-#ifdef CONFIG_DEV_DAX_PMEM_COMPAT
-	dax_pmem_compat_test();
-#endif
 
 	nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
 
-- 
cgit v1.2.3


From 16e0c35c6f7a2e90d52f3035ecf942af21417b7b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 12:01:04 -0800
Subject: libbpf: Load global data maps lazily on legacy kernels

Load global data maps lazily, if kernel is too old to support global
data. Make sure that programs are still correct by detecting if any of
the to-be-loaded programs have relocation against any of such maps.

This allows to solve the issue ([0]) with bpf_printk() and Clang
generating unnecessary and unreferenced .rodata.strX.Y sections, but it
also goes further along the CO-RE lines, allowing to have a BPF object
in which some code can work on very old kernels and relies only on BPF
maps explicitly, while other BPF programs might enjoy global variable
support. If such programs are correctly set to not load at runtime on
old kernels, bpf_object will load and function correctly now.

  [0] https://lore.kernel.org/bpf/CAK-59YFPU3qO+_pXWOH+c1LSA=8WA1yabJZfREjOEXNHAqgXNg@mail.gmail.com/

Fixes: aed659170a31 ("libbpf: Support multiple .rodata.* and .data.* BPF maps")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211123200105.387855-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index af405c38aadc..27695bf31250 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5006,6 +5006,24 @@ bpf_object__create_maps(struct bpf_object *obj)
 	for (i = 0; i < obj->nr_maps; i++) {
 		map = &obj->maps[i];
 
+		/* To support old kernels, we skip creating global data maps
+		 * (.rodata, .data, .kconfig, etc); later on, during program
+		 * loading, if we detect that at least one of the to-be-loaded
+		 * programs is referencing any global data map, we'll error
+		 * out with program name and relocation index logged.
+		 * This approach allows to accommodate Clang emitting
+		 * unnecessary .rodata.str1.1 sections for string literals,
+		 * but also it allows to have CO-RE applications that use
+		 * global variables in some of BPF programs, but not others.
+		 * If those global variable-using programs are not loaded at
+		 * runtime due to bpf_program__set_autoload(prog, false),
+		 * bpf_object loading will succeed just fine even on old
+		 * kernels.
+		 */
+		if (bpf_map__is_internal(map) &&
+		    !kernel_supports(obj, FEAT_GLOBAL_DATA))
+			continue;
+
 		retried = false;
 retry:
 		if (map->pin_path) {
@@ -5605,6 +5623,14 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 				insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE;
 				insn[0].imm = relo->map_idx;
 			} else {
+				const struct bpf_map *map = &obj->maps[relo->map_idx];
+
+				if (bpf_map__is_internal(map) &&
+				    !kernel_supports(obj, FEAT_GLOBAL_DATA)) {
+					pr_warn("prog '%s': relo #%d: kernel doesn't support global data\n",
+						prog->name, i);
+					return -ENOTSUP;
+				}
 				insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;
 				insn[0].imm = obj->maps[relo->map_idx].fd;
 			}
@@ -6139,6 +6165,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
 		 */
 		if (prog_is_subprog(obj, prog))
 			continue;
+		if (!prog->load)
+			continue;
 
 		err = bpf_object__relocate_calls(obj, prog);
 		if (err) {
@@ -6152,6 +6180,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
 		prog = &obj->programs[i];
 		if (prog_is_subprog(obj, prog))
 			continue;
+		if (!prog->load)
+			continue;
 		err = bpf_object__relocate_data(obj, prog);
 		if (err) {
 			pr_warn("prog '%s': failed to relocate data references: %d\n",
@@ -6939,10 +6969,6 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj)
 	bpf_object__for_each_map(m, obj) {
 		if (!bpf_map__is_internal(m))
 			continue;
-		if (!kernel_supports(obj, FEAT_GLOBAL_DATA)) {
-			pr_warn("kernel doesn't support global data\n");
-			return -ENOTSUP;
-		}
 		if (!kernel_supports(obj, FEAT_ARRAY_MMAP))
 			m->def.map_flags ^= BPF_F_MMAPABLE;
 	}
-- 
cgit v1.2.3


From e4f7ac90c2b09766e4acf771908987391c836413 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 12:01:05 -0800
Subject: selftests/bpf: Mix legacy (maps) and modern (vars) BPF in one test

Add selftest that combines two BPF programs within single BPF object
file such that one of the programs is using global variables, but can be
skipped at runtime on old kernels that don't support global data.
Another BPF program is written with the goal to be runnable on very old
kernels and only relies on explicitly accessed BPF maps.

Such test, run against old kernels (e.g., libbpf CI will run it against 4.9
kernel that doesn't support global data), allows to test the approach
and ensure that libbpf doesn't make unnecessary assumption about
necessary kernel features.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211123200105.387855-2-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/legacy_printk.c       | 65 +++++++++++++++++++
 .../selftests/bpf/progs/test_legacy_printk.c       | 73 ++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/legacy_printk.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_legacy_printk.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/legacy_printk.c b/tools/testing/selftests/bpf/prog_tests/legacy_printk.c
new file mode 100644
index 000000000000..ec6e45f2a644
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/legacy_printk.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include "test_legacy_printk.skel.h"
+
+static int execute_one_variant(bool legacy)
+{
+	struct test_legacy_printk *skel;
+	int err, zero = 0, my_pid = getpid(), res, map_fd;
+
+	skel = test_legacy_printk__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return -errno;
+
+	bpf_program__set_autoload(skel->progs.handle_legacy, legacy);
+	bpf_program__set_autoload(skel->progs.handle_modern, !legacy);
+
+	err = test_legacy_printk__load(skel);
+	/* no ASSERT_OK, we expect one of two variants can fail here */
+	if (err)
+		goto err_out;
+
+	if (legacy) {
+		map_fd = bpf_map__fd(skel->maps.my_pid_map);
+		err = bpf_map_update_elem(map_fd, &zero, &my_pid, BPF_ANY);
+		if (!ASSERT_OK(err, "my_pid_map_update"))
+			goto err_out;
+		err = bpf_map_lookup_elem(map_fd, &zero, &res);
+	} else {
+		skel->bss->my_pid_var = my_pid;
+	}
+
+	err = test_legacy_printk__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto err_out;
+
+	usleep(1); /* trigger */
+
+	if (legacy) {
+		map_fd = bpf_map__fd(skel->maps.res_map);
+		err = bpf_map_lookup_elem(map_fd, &zero, &res);
+		if (!ASSERT_OK(err, "res_map_lookup"))
+			goto err_out;
+	} else {
+		res = skel->bss->res_var;
+	}
+
+	if (!ASSERT_GT(res, 0, "res")) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+err_out:
+	test_legacy_printk__destroy(skel);
+	return err;
+}
+
+void test_legacy_printk(void)
+{
+	/* legacy variant should work everywhere */
+	ASSERT_OK(execute_one_variant(true /* legacy */), "legacy_case");
+
+	/* execute modern variant, can fail the load on old kernels */
+	execute_one_variant(false);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_legacy_printk.c b/tools/testing/selftests/bpf/progs/test_legacy_printk.c
new file mode 100644
index 000000000000..64c2d9ced529
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_legacy_printk.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bpf.h>
+#define BPF_NO_GLOBAL_DATA
+#include <bpf/bpf_helpers.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 1);
+} my_pid_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 1);
+} res_map SEC(".maps");
+
+volatile int my_pid_var = 0;
+volatile int res_var = 0;
+
+SEC("tp/raw_syscalls/sys_enter")
+int handle_legacy(void *ctx)
+{
+	int zero = 0, *my_pid, cur_pid, *my_res;
+
+	my_pid = bpf_map_lookup_elem(&my_pid_map, &zero);
+	if (!my_pid)
+		return 1;
+
+	cur_pid = bpf_get_current_pid_tgid() >> 32;
+	if (cur_pid != *my_pid)
+		return 1;
+
+	my_res = bpf_map_lookup_elem(&res_map, &zero);
+	if (!my_res)
+		return 1;
+
+	if (*my_res == 0)
+		/* use bpf_printk() in combination with BPF_NO_GLOBAL_DATA to
+		 * force .rodata.str1.1 section that previously caused
+		 * problems on old kernels due to libbpf always tried to
+		 * create a global data map for it
+		 */
+		bpf_printk("Legacy-case bpf_printk test, pid %d\n", cur_pid);
+	*my_res = 1;
+
+	return *my_res;
+}
+
+SEC("tp/raw_syscalls/sys_enter")
+int handle_modern(void *ctx)
+{
+	int zero = 0, cur_pid;
+
+	cur_pid = bpf_get_current_pid_tgid() >> 32;
+	if (cur_pid != my_pid_var)
+		return 1;
+
+	if (res_var == 0)
+		/* we need bpf_printk() to validate libbpf logic around unused
+		 * global maps and legacy kernels; see comment in handle_legacy()
+		 */
+		bpf_printk("Modern-case bpf_printk test, pid %d\n", cur_pid);
+	res_var = 1;
+
+	return res_var;
+}
-- 
cgit v1.2.3


From 992c4225419a38663d6239bc2f525b4ac0429188 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Nov 2021 11:32:30 -0800
Subject: libbpf: Unify low-level map creation APIs w/ new bpf_map_create()

Mark the entire zoo of low-level map creation APIs for deprecation in
libbpf 0.7 ([0]) and introduce a new bpf_map_create() API that is
OPTS-based (and thus future-proof) and matches the BPF_MAP_CREATE
command name.

While at it, ensure that gen_loader sends map_extra field. Also remove
now unneeded btf_key_type_id/btf_value_type_id logic that libbpf is
doing anyways.

  [0] Closes: https://github.com/libbpf/libbpf/issues/282

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124193233.3115996-2-andrii@kernel.org
---
 tools/lib/bpf/bpf.c              | 140 ++++++++++++++++-----------------------
 tools/lib/bpf/bpf.h              |  33 ++++++++-
 tools/lib/bpf/bpf_gen_internal.h |   5 +-
 tools/lib/bpf/gen_loader.c       |  46 +++++--------
 tools/lib/bpf/libbpf.c           |  33 +++++----
 tools/lib/bpf/libbpf.map         |   1 +
 tools/lib/bpf/libbpf_internal.h  |  21 ------
 7 files changed, 126 insertions(+), 153 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 94560ba31724..053c86e3d20f 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -88,146 +88,122 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int
 	return fd;
 }
 
-int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr)
+int bpf_map_create(enum bpf_map_type map_type,
+		   const char *map_name,
+		   __u32 key_size,
+		   __u32 value_size,
+		   __u32 max_entries,
+		   const struct bpf_map_create_opts *opts)
 {
+	const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
 	union bpf_attr attr;
 	int fd;
 
-	memset(&attr, '\0', sizeof(attr));
-
-	attr.map_type = create_attr->map_type;
-	attr.key_size = create_attr->key_size;
-	attr.value_size = create_attr->value_size;
-	attr.max_entries = create_attr->max_entries;
-	attr.map_flags = create_attr->map_flags;
-	if (create_attr->name)
-		memcpy(attr.map_name, create_attr->name,
-		       min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1));
-	attr.numa_node = create_attr->numa_node;
-	attr.btf_fd = create_attr->btf_fd;
-	attr.btf_key_type_id = create_attr->btf_key_type_id;
-	attr.btf_value_type_id = create_attr->btf_value_type_id;
-	attr.map_ifindex = create_attr->map_ifindex;
-	if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS)
-		attr.btf_vmlinux_value_type_id =
-			create_attr->btf_vmlinux_value_type_id;
-	else
-		attr.inner_map_fd = create_attr->inner_map_fd;
-	attr.map_extra = create_attr->map_extra;
+	memset(&attr, 0, attr_sz);
+
+	if (!OPTS_VALID(opts, bpf_map_create_opts))
+		return libbpf_err(-EINVAL);
 
-	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, sizeof(attr));
+	attr.map_type = map_type;
+	if (map_name)
+		strncat(attr.map_name, map_name, sizeof(attr.map_name) - 1);
+	attr.key_size = key_size;
+	attr.value_size = value_size;
+	attr.max_entries = max_entries;
+
+	attr.btf_fd = OPTS_GET(opts, btf_fd, 0);
+	attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0);
+	attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0);
+	attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0);
+
+	attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0);
+	attr.map_flags = OPTS_GET(opts, map_flags, 0);
+	attr.map_extra = OPTS_GET(opts, map_extra, 0);
+	attr.numa_node = OPTS_GET(opts, numa_node, 0);
+	attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);
+
+	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
 	return libbpf_err_errno(fd);
 }
 
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 {
-	struct bpf_create_map_params p = {};
+	LIBBPF_OPTS(bpf_map_create_opts, p);
 
-	p.map_type = create_attr->map_type;
-	p.key_size = create_attr->key_size;
-	p.value_size = create_attr->value_size;
-	p.max_entries = create_attr->max_entries;
 	p.map_flags = create_attr->map_flags;
-	p.name = create_attr->name;
 	p.numa_node = create_attr->numa_node;
 	p.btf_fd = create_attr->btf_fd;
 	p.btf_key_type_id = create_attr->btf_key_type_id;
 	p.btf_value_type_id = create_attr->btf_value_type_id;
 	p.map_ifindex = create_attr->map_ifindex;
-	if (p.map_type == BPF_MAP_TYPE_STRUCT_OPS)
-		p.btf_vmlinux_value_type_id =
-			create_attr->btf_vmlinux_value_type_id;
+	if (create_attr->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+		p.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id;
 	else
 		p.inner_map_fd = create_attr->inner_map_fd;
 
-	return libbpf__bpf_create_map_xattr(&p);
+	return bpf_map_create(create_attr->map_type, create_attr->name,
+			      create_attr->key_size, create_attr->value_size,
+			      create_attr->max_entries, &p);
 }
 
 int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
 			int key_size, int value_size, int max_entries,
 			__u32 map_flags, int node)
 {
-	struct bpf_create_map_attr map_attr = {};
-
-	map_attr.name = name;
-	map_attr.map_type = map_type;
-	map_attr.map_flags = map_flags;
-	map_attr.key_size = key_size;
-	map_attr.value_size = value_size;
-	map_attr.max_entries = max_entries;
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+	opts.map_flags = map_flags;
 	if (node >= 0) {
-		map_attr.numa_node = node;
-		map_attr.map_flags |= BPF_F_NUMA_NODE;
+		opts.numa_node = node;
+		opts.map_flags |= BPF_F_NUMA_NODE;
 	}
 
-	return bpf_create_map_xattr(&map_attr);
+	return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts);
 }
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size,
 		   int value_size, int max_entries, __u32 map_flags)
 {
-	struct bpf_create_map_attr map_attr = {};
-
-	map_attr.map_type = map_type;
-	map_attr.map_flags = map_flags;
-	map_attr.key_size = key_size;
-	map_attr.value_size = value_size;
-	map_attr.max_entries = max_entries;
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags);
 
-	return bpf_create_map_xattr(&map_attr);
+	return bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts);
 }
 
 int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
 			int key_size, int value_size, int max_entries,
 			__u32 map_flags)
 {
-	struct bpf_create_map_attr map_attr = {};
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags);
 
-	map_attr.name = name;
-	map_attr.map_type = map_type;
-	map_attr.map_flags = map_flags;
-	map_attr.key_size = key_size;
-	map_attr.value_size = value_size;
-	map_attr.max_entries = max_entries;
-
-	return bpf_create_map_xattr(&map_attr);
+	return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts);
 }
 
 int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
 			       int key_size, int inner_map_fd, int max_entries,
 			       __u32 map_flags, int node)
 {
-	union bpf_attr attr;
-	int fd;
-
-	memset(&attr, '\0', sizeof(attr));
-
-	attr.map_type = map_type;
-	attr.key_size = key_size;
-	attr.value_size = 4;
-	attr.inner_map_fd = inner_map_fd;
-	attr.max_entries = max_entries;
-	attr.map_flags = map_flags;
-	if (name)
-		memcpy(attr.map_name, name,
-		       min(strlen(name), BPF_OBJ_NAME_LEN - 1));
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 
+	opts.inner_map_fd = inner_map_fd;
+	opts.map_flags = map_flags;
 	if (node >= 0) {
-		attr.map_flags |= BPF_F_NUMA_NODE;
-		attr.numa_node = node;
+		opts.map_flags |= BPF_F_NUMA_NODE;
+		opts.numa_node = node;
 	}
 
-	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, sizeof(attr));
-	return libbpf_err_errno(fd);
+	return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts);
 }
 
 int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
 			  int key_size, int inner_map_fd, int max_entries,
 			  __u32 map_flags)
 {
-	return bpf_create_map_in_map_node(map_type, name, key_size,
-					  inner_map_fd, max_entries, map_flags,
-					  -1);
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		.inner_map_fd = inner_map_fd,
+		.map_flags = map_flags,
+	);
+
+	return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts);
 }
 
 static void *
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 079cc81ac51e..70b6f44fc8b0 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -35,6 +35,30 @@
 extern "C" {
 #endif
 
+struct bpf_map_create_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+
+	__u32 btf_fd;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
+	__u32 btf_vmlinux_value_type_id;
+
+	int inner_map_fd;
+	int map_flags;
+	__u64 map_extra;
+
+	int numa_node;
+	int map_ifindex;
+};
+#define bpf_map_create_opts__last_field map_ifindex
+
+LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
+			      const char *map_name,
+			      __u32 key_size,
+			      __u32 value_size,
+			      __u32 max_entries,
+			      const struct bpf_map_create_opts *opts);
+
 struct bpf_create_map_attr {
 	const char *name;
 	enum bpf_map_type map_type;
@@ -53,20 +77,25 @@ struct bpf_create_map_attr {
 	};
 };
 
-LIBBPF_API int
-bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
+LIBBPF_API int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
 LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
 				   int key_size, int value_size,
 				   int max_entries, __u32 map_flags, int node);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
 LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
 				   int key_size, int value_size,
 				   int max_entries, __u32 map_flags);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
 LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size,
 			      int value_size, int max_entries, __u32 map_flags);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
 LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type,
 					  const char *name, int key_size,
 					  int inner_map_fd, int max_entries,
 					  __u32 map_flags, int node);
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead")
 LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type,
 				     const char *name, int key_size,
 				     int inner_map_fd, int max_entries,
diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index 75ca9fb857b2..ae7704deba30 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -51,7 +51,10 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level);
 int bpf_gen__finish(struct bpf_gen *gen);
 void bpf_gen__free(struct bpf_gen *gen);
 void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size);
-void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_params *map_attr, int map_idx);
+void bpf_gen__map_create(struct bpf_gen *gen,
+			 enum bpf_map_type map_type, const char *map_name,
+			 __u32 key_size, __u32 value_size, __u32 max_entries,
+			 struct bpf_map_create_opts *map_attr, int map_idx);
 void bpf_gen__prog_load(struct bpf_gen *gen,
 			enum bpf_prog_type prog_type, const char *prog_name,
 			const char *license, struct bpf_insn *insns, size_t insn_cnt,
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 7b73f97b1fa1..c7bc77f4e752 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -432,47 +432,33 @@ void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data,
 }
 
 void bpf_gen__map_create(struct bpf_gen *gen,
-			 struct bpf_create_map_params *map_attr, int map_idx)
+			 enum bpf_map_type map_type,
+			 const char *map_name,
+			 __u32 key_size, __u32 value_size, __u32 max_entries,
+			 struct bpf_map_create_opts *map_attr, int map_idx)
 {
-	int attr_size = offsetofend(union bpf_attr, btf_vmlinux_value_type_id);
+	int attr_size = offsetofend(union bpf_attr, map_extra);
 	bool close_inner_map_fd = false;
 	int map_create_attr, idx;
 	union bpf_attr attr;
 
 	memset(&attr, 0, attr_size);
-	attr.map_type = map_attr->map_type;
-	attr.key_size = map_attr->key_size;
-	attr.value_size = map_attr->value_size;
+	attr.map_type = map_type;
+	attr.key_size = key_size;
+	attr.value_size = value_size;
 	attr.map_flags = map_attr->map_flags;
 	attr.map_extra = map_attr->map_extra;
-	memcpy(attr.map_name, map_attr->name,
-	       min((unsigned)strlen(map_attr->name), BPF_OBJ_NAME_LEN - 1));
+	if (map_name)
+		memcpy(attr.map_name, map_name,
+		       min((unsigned)strlen(map_name), BPF_OBJ_NAME_LEN - 1));
 	attr.numa_node = map_attr->numa_node;
 	attr.map_ifindex = map_attr->map_ifindex;
-	attr.max_entries = map_attr->max_entries;
-	switch (attr.map_type) {
-	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
-	case BPF_MAP_TYPE_CGROUP_ARRAY:
-	case BPF_MAP_TYPE_STACK_TRACE:
-	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
-	case BPF_MAP_TYPE_HASH_OF_MAPS:
-	case BPF_MAP_TYPE_DEVMAP:
-	case BPF_MAP_TYPE_DEVMAP_HASH:
-	case BPF_MAP_TYPE_CPUMAP:
-	case BPF_MAP_TYPE_XSKMAP:
-	case BPF_MAP_TYPE_SOCKMAP:
-	case BPF_MAP_TYPE_SOCKHASH:
-	case BPF_MAP_TYPE_QUEUE:
-	case BPF_MAP_TYPE_STACK:
-	case BPF_MAP_TYPE_RINGBUF:
-		break;
-	default:
-		attr.btf_key_type_id = map_attr->btf_key_type_id;
-		attr.btf_value_type_id = map_attr->btf_value_type_id;
-	}
+	attr.max_entries = max_entries;
+	attr.btf_key_type_id = map_attr->btf_key_type_id;
+	attr.btf_value_type_id = map_attr->btf_value_type_id;
 
 	pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n",
-		 attr.map_name, map_idx, map_attr->map_type, attr.btf_value_type_id);
+		 attr.map_name, map_idx, map_type, attr.btf_value_type_id);
 
 	map_create_attr = add_data(gen, &attr, attr_size);
 	if (attr.btf_value_type_id)
@@ -499,7 +485,7 @@ void bpf_gen__map_create(struct bpf_gen *gen,
 	/* emit MAP_CREATE command */
 	emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
 	debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d",
-		  attr.map_name, map_idx, map_attr->map_type, attr.value_size,
+		  attr.map_name, map_idx, map_type, value_size,
 		  attr.btf_value_type_id);
 	emit_check_err(gen);
 	/* remember map_fd in the stack, if successful */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 27695bf31250..c869a3721532 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4839,19 +4839,16 @@ static void bpf_map__destroy(struct bpf_map *map);
 
 static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner)
 {
-	struct bpf_create_map_params create_attr;
+	LIBBPF_OPTS(bpf_map_create_opts, create_attr);
 	struct bpf_map_def *def = &map->def;
+	const char *map_name = NULL;
+	__u32 max_entries;
 	int err = 0;
 
-	memset(&create_attr, 0, sizeof(create_attr));
-
 	if (kernel_supports(obj, FEAT_PROG_NAME))
-		create_attr.name = map->name;
+		map_name = map->name;
 	create_attr.map_ifindex = map->map_ifindex;
-	create_attr.map_type = def->type;
 	create_attr.map_flags = def->map_flags;
-	create_attr.key_size = def->key_size;
-	create_attr.value_size = def->value_size;
 	create_attr.numa_node = map->numa_node;
 	create_attr.map_extra = map->map_extra;
 
@@ -4865,18 +4862,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 			return nr_cpus;
 		}
 		pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus);
-		create_attr.max_entries = nr_cpus;
+		max_entries = nr_cpus;
 	} else {
-		create_attr.max_entries = def->max_entries;
+		max_entries = def->max_entries;
 	}
 
 	if (bpf_map__is_struct_ops(map))
-		create_attr.btf_vmlinux_value_type_id =
-			map->btf_vmlinux_value_type_id;
+		create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
 
-	create_attr.btf_fd = 0;
-	create_attr.btf_key_type_id = 0;
-	create_attr.btf_value_type_id = 0;
 	if (obj->btf && btf__fd(obj->btf) >= 0 && !bpf_map_find_btf_info(obj, map)) {
 		create_attr.btf_fd = btf__fd(obj->btf);
 		create_attr.btf_key_type_id = map->btf_key_type_id;
@@ -4922,13 +4915,17 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	}
 
 	if (obj->gen_loader) {
-		bpf_gen__map_create(obj->gen_loader, &create_attr, is_inner ? -1 : map - obj->maps);
+		bpf_gen__map_create(obj->gen_loader, def->type, map_name,
+				    def->key_size, def->value_size, max_entries,
+				    &create_attr, is_inner ? -1 : map - obj->maps);
 		/* Pretend to have valid FD to pass various fd >= 0 checks.
 		 * This fd == 0 will not be used with any syscall and will be reset to -1 eventually.
 		 */
 		map->fd = 0;
 	} else {
-		map->fd = libbpf__bpf_create_map_xattr(&create_attr);
+		map->fd = bpf_map_create(def->type, map_name,
+					 def->key_size, def->value_size,
+					 max_entries, &create_attr);
 	}
 	if (map->fd < 0 && (create_attr.btf_key_type_id ||
 			    create_attr.btf_value_type_id)) {
@@ -4943,7 +4940,9 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 		create_attr.btf_value_type_id = 0;
 		map->btf_key_type_id = 0;
 		map->btf_value_type_id = 0;
-		map->fd = libbpf__bpf_create_map_xattr(&create_attr);
+		map->fd = bpf_map_create(def->type, map_name,
+					 def->key_size, def->value_size,
+					 max_entries, &create_attr);
 	}
 
 	err = map->fd < 0 ? -errno : 0;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 69bc069f0a68..623002b83b2b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -391,6 +391,7 @@ LIBBPF_0.6.0 {
 	global:
 		bpf_map__map_extra;
 		bpf_map__set_map_extra;
+		bpf_map_create;
 		bpf_object__next_map;
 		bpf_object__next_program;
 		bpf_object__prev_map;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index f7ac349650a1..311905d8ca70 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -278,27 +278,6 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 			 const char *str_sec, size_t str_len);
 
-struct bpf_create_map_params {
-	const char *name;
-	enum bpf_map_type map_type;
-	__u32 map_flags;
-	__u32 key_size;
-	__u32 value_size;
-	__u32 max_entries;
-	__u32 numa_node;
-	__u32 btf_fd;
-	__u32 btf_key_type_id;
-	__u32 btf_value_type_id;
-	__u32 map_ifindex;
-	union {
-		__u32 inner_map_fd;
-		__u32 btf_vmlinux_value_type_id;
-	};
-	__u64 map_extra;
-};
-
-int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr);
-
 struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf);
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
 				const char **prefix, int *kind);
-- 
cgit v1.2.3


From a9606f405f2c8f24751b0a7326655a657a63ad60 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Nov 2021 11:32:31 -0800
Subject: libbpf: Use bpf_map_create() consistently internally

Remove all the remaining uses of to-be-deprecated bpf_create_map*() APIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124193233.3115996-3-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c        | 30 ++++++------------------------
 tools/lib/bpf/libbpf_probes.c | 30 +++++++++++++++---------------
 tools/lib/bpf/skel_internal.h |  3 +--
 tools/lib/bpf/xsk.c           | 13 +++----------
 4 files changed, 25 insertions(+), 51 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c869a3721532..e05dd785b347 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4361,7 +4361,6 @@ static int probe_kern_prog_name(void)
 
 static int probe_kern_global_data(void)
 {
-	struct bpf_create_map_attr map_attr;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	struct bpf_insn insns[] = {
 		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
@@ -4371,13 +4370,7 @@ static int probe_kern_global_data(void)
 	};
 	int ret, map, insn_cnt = ARRAY_SIZE(insns);
 
-	memset(&map_attr, 0, sizeof(map_attr));
-	map_attr.map_type = BPF_MAP_TYPE_ARRAY;
-	map_attr.key_size = sizeof(int);
-	map_attr.value_size = 32;
-	map_attr.max_entries = 1;
-
-	map = bpf_create_map_xattr(&map_attr);
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL);
 	if (map < 0) {
 		ret = -errno;
 		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
@@ -4507,15 +4500,11 @@ static int probe_kern_btf_type_tag(void)
 
 static int probe_kern_array_mmap(void)
 {
-	struct bpf_create_map_attr attr = {
-		.map_type = BPF_MAP_TYPE_ARRAY,
-		.map_flags = BPF_F_MMAPABLE,
-		.key_size = sizeof(int),
-		.value_size = sizeof(int),
-		.max_entries = 1,
-	};
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
+	int fd;
 
-	return probe_fd(bpf_create_map_xattr(&attr));
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(int), 1, &opts);
+	return probe_fd(fd);
 }
 
 static int probe_kern_exp_attach_type(void)
@@ -4554,7 +4543,6 @@ static int probe_kern_probe_read_kernel(void)
 
 static int probe_prog_bind_map(void)
 {
-	struct bpf_create_map_attr map_attr;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	struct bpf_insn insns[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -4562,13 +4550,7 @@ static int probe_prog_bind_map(void)
 	};
 	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
 
-	memset(&map_attr, 0, sizeof(map_attr));
-	map_attr.map_type = BPF_MAP_TYPE_ARRAY;
-	map_attr.key_size = sizeof(int);
-	map_attr.value_size = 32;
-	map_attr.max_entries = 1;
-
-	map = bpf_create_map_xattr(&map_attr);
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL);
 	if (map < 0) {
 		ret = -errno;
 		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 02c401e314c7..41f2be47c2ea 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -201,7 +201,6 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 {
 	int key_size, value_size, max_entries, map_flags;
 	__u32 btf_key_type_id = 0, btf_value_type_id = 0;
-	struct bpf_create_map_attr attr = {};
 	int fd = -1, btf_fd = -1, fd_inner;
 
 	key_size	= sizeof(__u32);
@@ -271,34 +270,35 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 
 	if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
 	    map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+		LIBBPF_OPTS(bpf_map_create_opts, opts);
+
 		/* TODO: probe for device, once libbpf has a function to create
 		 * map-in-map for offload
 		 */
 		if (ifindex)
 			return false;
 
-		fd_inner = bpf_create_map(BPF_MAP_TYPE_HASH,
-					  sizeof(__u32), sizeof(__u32), 1, 0);
+		fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
+					  sizeof(__u32), sizeof(__u32), 1, NULL);
 		if (fd_inner < 0)
 			return false;
-		fd = bpf_create_map_in_map(map_type, NULL, sizeof(__u32),
-					   fd_inner, 1, 0);
+
+		opts.inner_map_fd = fd_inner;
+		fd = bpf_map_create(map_type, NULL, sizeof(__u32), sizeof(__u32), 1, &opts);
 		close(fd_inner);
 	} else {
+		LIBBPF_OPTS(bpf_map_create_opts, opts);
+
 		/* Note: No other restriction on map type probes for offload */
-		attr.map_type = map_type;
-		attr.key_size = key_size;
-		attr.value_size = value_size;
-		attr.max_entries = max_entries;
-		attr.map_flags = map_flags;
-		attr.map_ifindex = ifindex;
+		opts.map_flags = map_flags;
+		opts.map_ifindex = ifindex;
 		if (btf_fd >= 0) {
-			attr.btf_fd = btf_fd;
-			attr.btf_key_type_id = btf_key_type_id;
-			attr.btf_value_type_id = btf_value_type_id;
+			opts.btf_fd = btf_fd;
+			opts.btf_key_type_id = btf_key_type_id;
+			opts.btf_value_type_id = btf_value_type_id;
 		}
 
-		fd = bpf_create_map_xattr(&attr);
+		fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts);
 	}
 	if (fd >= 0)
 		close(fd);
diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h
index 9cf66702fa8d..b206532704ce 100644
--- a/tools/lib/bpf/skel_internal.h
+++ b/tools/lib/bpf/skel_internal.h
@@ -65,8 +65,7 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts)
 	int map_fd = -1, prog_fd = -1, key = 0, err;
 	union bpf_attr attr;
 
-	map_fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "__loader.map", 4,
-				     opts->data_sz, 1, 0);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, NULL);
 	if (map_fd < 0) {
 		opts->errstr = "failed to create loader map";
 		err = -errno;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index fdb22f5405c9..f1c29074d527 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -364,7 +364,6 @@ int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
 static enum xsk_prog get_xsk_prog(void)
 {
 	enum xsk_prog detected = XSK_PROG_FALLBACK;
-	struct bpf_create_map_attr map_attr;
 	__u32 size_out, retval, duration;
 	char data_in = 0, data_out;
 	struct bpf_insn insns[] = {
@@ -376,13 +375,7 @@ static enum xsk_prog get_xsk_prog(void)
 	};
 	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
 
-	memset(&map_attr, 0, sizeof(map_attr));
-	map_attr.map_type = BPF_MAP_TYPE_XSKMAP;
-	map_attr.key_size = sizeof(int);
-	map_attr.value_size = sizeof(int);
-	map_attr.max_entries = 1;
-
-	map_fd = bpf_create_map_xattr(&map_attr);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
 	if (map_fd < 0)
 		return detected;
 
@@ -586,8 +579,8 @@ static int xsk_create_bpf_maps(struct xsk_socket *xsk)
 	if (max_queues < 0)
 		return max_queues;
 
-	fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map",
-				 sizeof(int), sizeof(int), max_queues, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
+			    sizeof(int), sizeof(int), max_queues, NULL);
 	if (fd < 0)
 		return fd;
 
-- 
cgit v1.2.3


From 99a12a32fee4f740af2f36bb8f64e11c026f3389 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Nov 2021 11:32:32 -0800
Subject: libbpf: Prevent deprecation warnings in xsk.c

xsk.c is using own APIs that are marked for deprecation internally.
Given xsk.c and xsk.h will be gone in libbpf 1.0, there is no reason to
do public vs internal function split just to avoid deprecation warnings.
So just add a pragma to silence deprecation warnings (until the code is
removed completely).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124193233.3115996-4-andrii@kernel.org
---
 tools/lib/bpf/xsk.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index f1c29074d527..e8d94c6dd3bc 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -35,6 +35,11 @@
 #include "libbpf_internal.h"
 #include "xsk.h"
 
+/* entire xsk.h and xsk.c is going away in libbpf 1.0, so ignore all internal
+ * uses of deprecated APIs
+ */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
 #ifndef SOL_XDP
  #define SOL_XDP 283
 #endif
-- 
cgit v1.2.3


From 2fe256a429cb6c0b0064563af4158470143a363c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Nov 2021 11:32:33 -0800
Subject: selftests/bpf: Migrate selftests to bpf_map_create()

Conversion is straightforward for most cases. In few cases tests are
using mutable map_flags and attribute structs, but bpf_map_create_opts
can be used in the similar fashion, so there were no problems. Just lots
of repetitive conversions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124193233.3115996-5-andrii@kernel.org
---
 .../selftests/bpf/map_tests/array_map_batch_ops.c  |  13 +--
 .../selftests/bpf/map_tests/htab_map_batch_ops.c   |  13 +--
 .../bpf/map_tests/lpm_trie_map_batch_ops.c         |  15 +--
 .../selftests/bpf/map_tests/sk_storage_map.c       |  50 +++++-----
 .../selftests/bpf/prog_tests/bloom_filter_map.c    |  36 +++----
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  |   8 +-
 tools/testing/selftests/bpf/prog_tests/btf.c       |  51 ++++------
 .../selftests/bpf/prog_tests/cgroup_attach_multi.c |  12 +--
 tools/testing/selftests/bpf/prog_tests/pinning.c   |   4 +-
 .../selftests/bpf/prog_tests/ringbuf_multi.c       |   4 +-
 .../selftests/bpf/prog_tests/select_reuseport.c    |  21 ++--
 .../selftests/bpf/prog_tests/sockmap_basic.c       |   4 +-
 .../selftests/bpf/prog_tests/sockmap_ktls.c        |   2 +-
 .../selftests/bpf/prog_tests/sockmap_listen.c      |   4 +-
 .../testing/selftests/bpf/prog_tests/test_bpffs.c  |   2 +-
 tools/testing/selftests/bpf/test_cgroup_storage.c  |   8 +-
 tools/testing/selftests/bpf/test_lpm_map.c         |  27 ++---
 tools/testing/selftests/bpf/test_lru_map.c         |  16 +--
 tools/testing/selftests/bpf/test_maps.c            | 110 +++++++++++----------
 tools/testing/selftests/bpf/test_tag.c             |   5 +-
 tools/testing/selftests/bpf/test_verifier.c        |  52 +++++-----
 21 files changed, 201 insertions(+), 256 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
index f4d870da7684..78c76496b14a 100644
--- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
@@ -68,13 +68,6 @@ static void map_batch_verify(int *visited, __u32 max_entries, int *keys,
 
 static void __test_map_lookup_and_update_batch(bool is_pcpu)
 {
-	struct bpf_create_map_attr xattr = {
-		.name = "array_map",
-		.map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY :
-				      BPF_MAP_TYPE_ARRAY,
-		.key_size = sizeof(int),
-		.value_size = sizeof(__s64),
-	};
 	int map_fd, *keys, *visited;
 	__u32 count, total, total_success;
 	const __u32 max_entries = 10;
@@ -86,10 +79,10 @@ static void __test_map_lookup_and_update_batch(bool is_pcpu)
 		.flags = 0,
 	);
 
-	xattr.max_entries = max_entries;
-	map_fd = bpf_create_map_xattr(&xattr);
+	map_fd = bpf_map_create(is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY : BPF_MAP_TYPE_ARRAY,
+				"array_map", sizeof(int), sizeof(__s64), max_entries, NULL);
 	CHECK(map_fd == -1,
-	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+	      "bpf_map_create()", "error:%s\n", strerror(errno));
 
 	value_size = sizeof(__s64);
 	if (is_pcpu)
diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
index 976bf415fbdd..f807d53fd8dd 100644
--- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
@@ -83,22 +83,15 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu)
 	int err, step, value_size;
 	bool nospace_err;
 	void *values;
-	struct bpf_create_map_attr xattr = {
-		.name = "hash_map",
-		.map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH :
-			    BPF_MAP_TYPE_HASH,
-		.key_size = sizeof(int),
-		.value_size = sizeof(int),
-	};
 	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
 		.elem_flags = 0,
 		.flags = 0,
 	);
 
-	xattr.max_entries = max_entries;
-	map_fd = bpf_create_map_xattr(&xattr);
+	map_fd = bpf_map_create(is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH : BPF_MAP_TYPE_HASH,
+				"hash_map", sizeof(int), sizeof(int), max_entries, NULL);
 	CHECK(map_fd == -1,
-	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+	      "bpf_map_create()", "error:%s\n", strerror(errno));
 
 	value_size = is_pcpu ? sizeof(value) : sizeof(int);
 	keys = malloc(max_entries * sizeof(int));
diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
index 2e986e5e4cac..87d07b596e17 100644
--- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
@@ -64,13 +64,7 @@ static void map_batch_verify(int *visited, __u32 max_entries,
 
 void test_lpm_trie_map_batch_ops(void)
 {
-	struct bpf_create_map_attr xattr = {
-		.name = "lpm_trie_map",
-		.map_type = BPF_MAP_TYPE_LPM_TRIE,
-		.key_size = sizeof(struct test_lpm_key),
-		.value_size = sizeof(int),
-		.map_flags = BPF_F_NO_PREALLOC,
-	};
+	LIBBPF_OPTS(bpf_map_create_opts, create_opts, .map_flags = BPF_F_NO_PREALLOC);
 	struct test_lpm_key *keys, key;
 	int map_fd, *values, *visited;
 	__u32 step, count, total, total_success;
@@ -82,9 +76,10 @@ void test_lpm_trie_map_batch_ops(void)
 		.flags = 0,
 	);
 
-	xattr.max_entries = max_entries;
-	map_fd = bpf_create_map_xattr(&xattr);
-	CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n",
+	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie_map",
+				sizeof(struct test_lpm_key), sizeof(int),
+				max_entries, &create_opts);
+	CHECK(map_fd == -1, "bpf_map_create()", "error:%s\n",
 	      strerror(errno));
 
 	keys = malloc(max_entries * sizeof(struct test_lpm_key));
diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
index e569edc679d8..8eea4ffeb092 100644
--- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
@@ -19,16 +19,12 @@
 #include <test_btf.h>
 #include <test_maps.h>
 
-static struct bpf_create_map_attr xattr = {
-	.name = "sk_storage_map",
-	.map_type = BPF_MAP_TYPE_SK_STORAGE,
-	.map_flags = BPF_F_NO_PREALLOC,
-	.max_entries = 0,
-	.key_size = 4,
-	.value_size = 8,
+static struct bpf_map_create_opts map_opts = {
+	.sz = sizeof(map_opts),
 	.btf_key_type_id = 1,
 	.btf_value_type_id = 3,
 	.btf_fd = -1,
+	.map_flags = BPF_F_NO_PREALLOC,
 };
 
 static unsigned int nr_sk_threads_done;
@@ -150,13 +146,13 @@ static int create_sk_storage_map(void)
 	btf_fd = load_btf();
 	CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
 	      btf_fd, errno);
-	xattr.btf_fd = btf_fd;
+	map_opts.btf_fd = btf_fd;
 
-	map_fd = bpf_create_map_xattr(&xattr);
-	xattr.btf_fd = -1;
+	map_fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &map_opts);
+	map_opts.btf_fd = -1;
 	close(btf_fd);
 	CHECK(map_fd == -1,
-	      "bpf_create_map_xattr()", "errno:%d\n", errno);
+	      "bpf_map_create()", "errno:%d\n", errno);
 
 	return map_fd;
 }
@@ -463,20 +459,20 @@ static void test_sk_storage_map_basic(void)
 		int cnt;
 		int lock;
 	} value = { .cnt = 0xeB9f, .lock = 0, }, lookup_value;
-	struct bpf_create_map_attr bad_xattr;
+	struct bpf_map_create_opts bad_xattr;
 	int btf_fd, map_fd, sk_fd, err;
 
 	btf_fd = load_btf();
 	CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
 	      btf_fd, errno);
-	xattr.btf_fd = btf_fd;
+	map_opts.btf_fd = btf_fd;
 
 	sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
 	CHECK(sk_fd == -1, "socket()", "sk_fd:%d errno:%d\n",
 	      sk_fd, errno);
 
-	map_fd = bpf_create_map_xattr(&xattr);
-	CHECK(map_fd == -1, "bpf_create_map_xattr(good_xattr)",
+	map_fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &map_opts);
+	CHECK(map_fd == -1, "bpf_map_create(good_xattr)",
 	      "map_fd:%d errno:%d\n", map_fd, errno);
 
 	/* Add new elem */
@@ -560,31 +556,29 @@ static void test_sk_storage_map_basic(void)
 	CHECK(!err || errno != ENOENT, "bpf_map_delete_elem()",
 	      "err:%d errno:%d\n", err, errno);
 
-	memcpy(&bad_xattr, &xattr, sizeof(xattr));
+	memcpy(&bad_xattr, &map_opts, sizeof(map_opts));
 	bad_xattr.btf_key_type_id = 0;
-	err = bpf_create_map_xattr(&bad_xattr);
-	CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
+	err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr);
+	CHECK(!err || errno != EINVAL, "bpf_map_create(bad_xattr)",
 	      "err:%d errno:%d\n", err, errno);
 
-	memcpy(&bad_xattr, &xattr, sizeof(xattr));
+	memcpy(&bad_xattr, &map_opts, sizeof(map_opts));
 	bad_xattr.btf_key_type_id = 3;
-	err = bpf_create_map_xattr(&bad_xattr);
-	CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
+	err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr);
+	CHECK(!err || errno != EINVAL, "bpf_map_create(bad_xattr)",
 	      "err:%d errno:%d\n", err, errno);
 
-	memcpy(&bad_xattr, &xattr, sizeof(xattr));
-	bad_xattr.max_entries = 1;
-	err = bpf_create_map_xattr(&bad_xattr);
-	CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
+	err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 1, &map_opts);
+	CHECK(!err || errno != EINVAL, "bpf_map_create(bad_xattr)",
 	      "err:%d errno:%d\n", err, errno);
 
-	memcpy(&bad_xattr, &xattr, sizeof(xattr));
+	memcpy(&bad_xattr, &map_opts, sizeof(map_opts));
 	bad_xattr.map_flags = 0;
-	err = bpf_create_map_xattr(&bad_xattr);
+	err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr);
 	CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)",
 	      "err:%d errno:%d\n", err, errno);
 
-	xattr.btf_fd = -1;
+	map_opts.btf_fd = -1;
 	close(btf_fd);
 	close(map_fd);
 	close(sk_fd);
diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
index be73e3de6668..d2d9e965eba5 100644
--- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
@@ -7,32 +7,33 @@
 
 static void test_fail_cases(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	__u32 value;
 	int fd, err;
 
 	/* Invalid key size */
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 4, sizeof(value), 100, 0);
-	if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid key size"))
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 4, sizeof(value), 100, NULL);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid key size"))
 		close(fd);
 
 	/* Invalid value size */
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, 0, 100, 0);
-	if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid value size 0"))
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, 0, 100, NULL);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value size 0"))
 		close(fd);
 
 	/* Invalid max entries size */
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 0, 0);
-	if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid max entries size"))
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 0, NULL);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid max entries size"))
 		close(fd);
 
 	/* Bloom filter maps do not support BPF_F_NO_PREALLOC */
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100,
-			    BPF_F_NO_PREALLOC);
-	if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid flags"))
+	opts.map_flags = BPF_F_NO_PREALLOC;
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, &opts);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid flags"))
 		close(fd);
 
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, 0);
-	if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter"))
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, NULL);
+	if (!ASSERT_GE(fd, 0, "bpf_map_create bloom filter"))
 		return;
 
 	/* Test invalid flags */
@@ -56,13 +57,14 @@ static void test_fail_cases(void)
 
 static void test_success_cases(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	char value[11];
 	int fd, err;
 
 	/* Create a map */
-	fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100,
-			    BPF_F_ZERO_SEED | BPF_F_NUMA_NODE);
-	if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter success case"))
+	opts.map_flags = BPF_F_ZERO_SEED | BPF_F_NUMA_NODE;
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, &opts);
+	if (!ASSERT_GE(fd, 0, "bpf_map_create bloom filter success case"))
 		return;
 
 	/* Add a value to the bloom filter */
@@ -100,9 +102,9 @@ static void test_inner_map(struct bloom_filter_map *skel, const __u32 *rand_vals
 	struct bpf_link *link;
 
 	/* Create a bloom filter map that will be used as the inner map */
-	inner_map_fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(*rand_vals),
-				      nr_rand_vals, 0);
-	if (!ASSERT_GE(inner_map_fd, 0, "bpf_create_map bloom filter inner map"))
+	inner_map_fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(*rand_vals),
+				      nr_rand_vals, NULL);
+	if (!ASSERT_GE(inner_map_fd, 0, "bpf_map_create bloom filter inner map"))
 		return;
 
 	for (i = 0; i < nr_rand_vals; i++) {
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 3e10abce3e5a..0b996be923b5 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -469,12 +469,12 @@ static void test_overflow(bool test_e2big_overflow, bool ret1)
 	 * fills seq_file buffer and then the other will trigger
 	 * overflow and needs restart.
 	 */
-	map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
-	if (CHECK(map1_fd < 0, "bpf_create_map",
+	map1_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL);
+	if (CHECK(map1_fd < 0, "bpf_map_create",
 		  "map_creation failed: %s\n", strerror(errno)))
 		goto out;
-	map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
-	if (CHECK(map2_fd < 0, "bpf_create_map",
+	map2_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL);
+	if (CHECK(map2_fd < 0, "bpf_map_create",
 		  "map_creation failed: %s\n", strerror(errno)))
 		goto free_map1;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index f9326a13badb..cab810bab593 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -4074,7 +4074,7 @@ done:
 static void do_test_raw(unsigned int test_num)
 {
 	struct btf_raw_test *test = &raw_tests[test_num - 1];
-	struct bpf_create_map_attr create_attr = {};
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	int map_fd = -1, btf_fd = -1;
 	unsigned int raw_btf_size;
 	struct btf_header *hdr;
@@ -4117,16 +4117,11 @@ static void do_test_raw(unsigned int test_num)
 	if (err || btf_fd < 0)
 		goto done;
 
-	create_attr.name = test->map_name;
-	create_attr.map_type = test->map_type;
-	create_attr.key_size = test->key_size;
-	create_attr.value_size = test->value_size;
-	create_attr.max_entries = test->max_entries;
-	create_attr.btf_fd = btf_fd;
-	create_attr.btf_key_type_id = test->key_type_id;
-	create_attr.btf_value_type_id = test->value_type_id;
-
-	map_fd = bpf_create_map_xattr(&create_attr);
+	opts.btf_fd = btf_fd;
+	opts.btf_key_type_id = test->key_type_id;
+	opts.btf_value_type_id = test->value_type_id;
+	map_fd = bpf_map_create(test->map_type, test->map_name,
+				test->key_size, test->value_size, test->max_entries, &opts);
 
 	err = ((map_fd < 0) != test->map_create_err);
 	CHECK(err, "map_fd:%d test->map_create_err:%u",
@@ -4290,7 +4285,7 @@ done:
 static int test_btf_id(unsigned int test_num)
 {
 	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
-	struct bpf_create_map_attr create_attr = {};
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	uint8_t *raw_btf = NULL, *user_btf[2] = {};
 	int btf_fd[2] = {-1, -1}, map_fd = -1;
 	struct bpf_map_info map_info = {};
@@ -4355,16 +4350,11 @@ static int test_btf_id(unsigned int test_num)
 	}
 
 	/* Test btf members in struct bpf_map_info */
-	create_attr.name = "test_btf_id";
-	create_attr.map_type = BPF_MAP_TYPE_ARRAY;
-	create_attr.key_size = sizeof(int);
-	create_attr.value_size = sizeof(unsigned int);
-	create_attr.max_entries = 4;
-	create_attr.btf_fd = btf_fd[0];
-	create_attr.btf_key_type_id = 1;
-	create_attr.btf_value_type_id = 2;
-
-	map_fd = bpf_create_map_xattr(&create_attr);
+	opts.btf_fd = btf_fd[0];
+	opts.btf_key_type_id = 1;
+	opts.btf_value_type_id = 2;
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "test_btf_id",
+				sizeof(int), sizeof(int), 4, &opts);
 	if (CHECK(map_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
@@ -5153,7 +5143,7 @@ static void do_test_pprint(int test_num)
 {
 	const struct btf_raw_test *test = &pprint_test_template[test_num];
 	enum pprint_mapv_kind_t mapv_kind = test->mapv_kind;
-	struct bpf_create_map_attr create_attr = {};
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	bool ordered_map, lossless_map, percpu_map;
 	int err, ret, num_cpus, rounded_value_size;
 	unsigned int key, nr_read_elems;
@@ -5189,16 +5179,11 @@ static void do_test_pprint(int test_num)
 		goto done;
 	}
 
-	create_attr.name = test->map_name;
-	create_attr.map_type = test->map_type;
-	create_attr.key_size = test->key_size;
-	create_attr.value_size = test->value_size;
-	create_attr.max_entries = test->max_entries;
-	create_attr.btf_fd = btf_fd;
-	create_attr.btf_key_type_id = test->key_type_id;
-	create_attr.btf_value_type_id = test->value_type_id;
-
-	map_fd = bpf_create_map_xattr(&create_attr);
+	opts.btf_fd = btf_fd;
+	opts.btf_key_type_id = test->key_type_id;
+	opts.btf_value_type_id = test->value_type_id;
+	map_fd = bpf_map_create(test->map_type, test->map_name,
+				test->key_size, test->value_size, test->max_entries, &opts);
 	if (CHECK(map_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
index de9c3e12b0ea..d3e8f729c623 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
@@ -15,22 +15,22 @@ static int prog_load_cnt(int verdict, int val)
 	int cgroup_storage_fd, percpu_cgroup_storage_fd;
 
 	if (map_fd < 0)
-		map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+		map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL);
 	if (map_fd < 0) {
 		printf("failed to create map '%s'\n", strerror(errno));
 		return -1;
 	}
 
-	cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE,
-				sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+	cgroup_storage_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL,
+				sizeof(struct bpf_cgroup_storage_key), 8, 0, NULL);
 	if (cgroup_storage_fd < 0) {
 		printf("failed to create map '%s'\n", strerror(errno));
 		return -1;
 	}
 
-	percpu_cgroup_storage_fd = bpf_create_map(
-		BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
-		sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+	percpu_cgroup_storage_fd = bpf_map_create(
+		BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL,
+		sizeof(struct bpf_cgroup_storage_key), 8, 0, NULL);
 	if (percpu_cgroup_storage_fd < 0) {
 		printf("failed to create map '%s'\n", strerror(errno));
 		return -1;
diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c
index d4b953ae3407..31c09ba577eb 100644
--- a/tools/testing/selftests/bpf/prog_tests/pinning.c
+++ b/tools/testing/selftests/bpf/prog_tests/pinning.c
@@ -241,8 +241,8 @@ void test_pinning(void)
 		goto out;
 	}
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
-				sizeof(__u64), 1, 0);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(__u32),
+				sizeof(__u64), 1, NULL);
 	if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd))
 		goto out;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
index 167cd8a2edfd..e945195b24c9 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
@@ -62,8 +62,8 @@ void test_ringbuf_multi(void)
 	if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n"))
 		goto cleanup;
 
-	proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0);
-	if (CHECK(proto_fd < 0, "bpf_create_map", "bpf_create_map failed\n"))
+	proto_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL, 0, 0, page_size, NULL);
+	if (CHECK(proto_fd < 0, "bpf_map_create", "bpf_map_create failed\n"))
 		goto cleanup;
 
 	err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd);
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
index 3cfc910ab3c1..980ac0f2c0bb 100644
--- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -66,29 +66,20 @@ static union sa46 {
 
 static int create_maps(enum bpf_map_type inner_type)
 {
-	struct bpf_create_map_attr attr = {};
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 
 	inner_map_type = inner_type;
 
 	/* Creating reuseport_array */
-	attr.name = "reuseport_array";
-	attr.map_type = inner_type;
-	attr.key_size = sizeof(__u32);
-	attr.value_size = sizeof(__u32);
-	attr.max_entries = REUSEPORT_ARRAY_SIZE;
-
-	reuseport_array = bpf_create_map_xattr(&attr);
+	reuseport_array = bpf_map_create(inner_type, "reuseport_array",
+					 sizeof(__u32), sizeof(__u32), REUSEPORT_ARRAY_SIZE, NULL);
 	RET_ERR(reuseport_array < 0, "creating reuseport_array",
 		"reuseport_array:%d errno:%d\n", reuseport_array, errno);
 
 	/* Creating outer_map */
-	attr.name = "outer_map";
-	attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS;
-	attr.key_size = sizeof(__u32);
-	attr.value_size = sizeof(__u32);
-	attr.max_entries = 1;
-	attr.inner_map_fd = reuseport_array;
-	outer_map = bpf_create_map_xattr(&attr);
+	opts.inner_map_fd = reuseport_array;
+	outer_map = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer_map",
+				   sizeof(__u32), sizeof(__u32), 1, &opts);
 	RET_ERR(outer_map < 0, "creating outer_map",
 		"outer_map:%d errno:%d\n", outer_map, errno);
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 1352ec104149..85db0f4cdd95 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -91,9 +91,9 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type)
 	if (CHECK_FAIL(s < 0))
 		return;
 
-	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
+	map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL);
 	if (CHECK_FAIL(map < 0)) {
-		perror("bpf_create_map");
+		perror("bpf_cmap_create");
 		goto out;
 	}
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
index 7a0d64fdc192..af293ea1542c 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
@@ -97,7 +97,7 @@ static void run_tests(int family, enum bpf_map_type map_type)
 	char test_name[MAX_TEST_NAME];
 	int map;
 
-	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
+	map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL);
 	if (CHECK_FAIL(map < 0)) {
 		perror("bpf_map_create");
 		return;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 2a9cb951bfd6..7e21bfab6358 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -502,8 +502,8 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd)
 	if (s < 0)
 		return;
 
-	mapfd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(key),
-			       sizeof(value32), 1, 0);
+	mapfd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(key),
+			       sizeof(value32), 1, NULL);
 	if (mapfd < 0) {
 		FAIL_ERRNO("map_create");
 		goto close;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
index d29ebfeef9c5..ada95bfb9b1b 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
@@ -80,7 +80,7 @@ static int fn(void)
 	if (!ASSERT_OK(err, "creating " TDIR "/fs1/b"))
 		goto out;
 
-	map = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 4, 1, 0);
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL);
 	if (!ASSERT_GT(map, 0, "create_map(ARRAY)"))
 		goto out;
 	err = bpf_obj_pin(map, TDIR "/fs1/c");
diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c
index a63787e7bb1a..5b8314cd77fd 100644
--- a/tools/testing/selftests/bpf/test_cgroup_storage.c
+++ b/tools/testing/selftests/bpf/test_cgroup_storage.c
@@ -51,15 +51,15 @@ int main(int argc, char **argv)
 		goto err;
 	}
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, sizeof(key),
-				sizeof(value), 0, 0);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, sizeof(key),
+				sizeof(value), 0, NULL);
 	if (map_fd < 0) {
 		printf("Failed to create map: %s\n", strerror(errno));
 		goto out;
 	}
 
-	percpu_map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
-				       sizeof(key), sizeof(value), 0, 0);
+	percpu_map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL,
+				       sizeof(key), sizeof(value), 0, NULL);
 	if (percpu_map_fd < 0) {
 		printf("Failed to create map: %s\n", strerror(errno));
 		goto out;
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
index 006be3963977..baa3e3ecae82 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -208,6 +208,7 @@ static void test_lpm_order(void)
 
 static void test_lpm_map(int keysize)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups;
 	struct tlpm_node *t, *list = NULL;
 	struct bpf_lpm_trie_key *key;
@@ -233,11 +234,11 @@ static void test_lpm_map(int keysize)
 	key = alloca(sizeof(*key) + keysize);
 	memset(key, 0, sizeof(*key) + keysize);
 
-	map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+	map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL,
 			     sizeof(*key) + keysize,
 			     keysize + 1,
 			     4096,
-			     BPF_F_NO_PREALLOC);
+			     &opts);
 	assert(map >= 0);
 
 	for (i = 0; i < n_nodes; ++i) {
@@ -329,6 +330,7 @@ static void test_lpm_map(int keysize)
 
 static void test_lpm_ipaddr(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	struct bpf_lpm_trie_key *key_ipv4;
 	struct bpf_lpm_trie_key *key_ipv6;
 	size_t key_size_ipv4;
@@ -342,14 +344,14 @@ static void test_lpm_ipaddr(void)
 	key_ipv4 = alloca(key_size_ipv4);
 	key_ipv6 = alloca(key_size_ipv6);
 
-	map_fd_ipv4 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+	map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL,
 				     key_size_ipv4, sizeof(value),
-				     100, BPF_F_NO_PREALLOC);
+				     100, &opts);
 	assert(map_fd_ipv4 >= 0);
 
-	map_fd_ipv6 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+	map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL,
 				     key_size_ipv6, sizeof(value),
-				     100, BPF_F_NO_PREALLOC);
+				     100, &opts);
 	assert(map_fd_ipv6 >= 0);
 
 	/* Fill data some IPv4 and IPv6 address ranges */
@@ -423,6 +425,7 @@ static void test_lpm_ipaddr(void)
 
 static void test_lpm_delete(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	struct bpf_lpm_trie_key *key;
 	size_t key_size;
 	int map_fd;
@@ -431,9 +434,9 @@ static void test_lpm_delete(void)
 	key_size = sizeof(*key) + sizeof(__u32);
 	key = alloca(key_size);
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL,
 				key_size, sizeof(value),
-				100, BPF_F_NO_PREALLOC);
+				100, &opts);
 	assert(map_fd >= 0);
 
 	/* Add nodes:
@@ -535,6 +538,7 @@ static void test_lpm_delete(void)
 
 static void test_lpm_get_next_key(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	struct bpf_lpm_trie_key *key_p, *next_key_p;
 	size_t key_size;
 	__u32 value = 0;
@@ -544,8 +548,7 @@ static void test_lpm_get_next_key(void)
 	key_p = alloca(key_size);
 	next_key_p = alloca(key_size);
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value),
-				100, BPF_F_NO_PREALLOC);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, sizeof(value), 100, &opts);
 	assert(map_fd >= 0);
 
 	/* empty tree. get_next_key should return ENOENT */
@@ -753,6 +756,7 @@ static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd)
 
 static void test_lpm_multi_thread(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	struct lpm_mt_test_info info[4];
 	size_t key_size, value_size;
 	pthread_t thread_id[4];
@@ -762,8 +766,7 @@ static void test_lpm_multi_thread(void)
 	/* create a trie */
 	value_size = sizeof(__u32);
 	key_size = sizeof(struct bpf_lpm_trie_key) + value_size;
-	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size,
-				100, BPF_F_NO_PREALLOC);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, value_size, 100, &opts);
 
 	/* create 4 threads to test update, delete, lookup and get_next_key */
 	setup_lpm_mt_test_info(&info[0], map_fd);
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 7f3d1d8460b4..b9f1bbbc8aba 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -28,13 +28,14 @@ static int nr_cpus;
 
 static int create_map(int map_type, int map_flags, unsigned int size)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags);
 	int map_fd;
 
-	map_fd = bpf_create_map(map_type, sizeof(unsigned long long),
-				sizeof(unsigned long long), size, map_flags);
+	map_fd = bpf_map_create(map_type, NULL,  sizeof(unsigned long long),
+				sizeof(unsigned long long), size, &opts);
 
 	if (map_fd == -1)
-		perror("bpf_create_map");
+		perror("bpf_map_create");
 
 	return map_fd;
 }
@@ -42,7 +43,6 @@ static int create_map(int map_type, int map_flags, unsigned int size)
 static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
 					    void *value)
 {
-	struct bpf_create_map_attr map;
 	struct bpf_insn insns[] = {
 		BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0),
 		BPF_LD_MAP_FD(BPF_REG_1, fd),
@@ -63,13 +63,7 @@ static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
 	int mfd, pfd, ret, zero = 0;
 	__u32 retval = 0;
 
-	memset(&map, 0, sizeof(map));
-	map.map_type = BPF_MAP_TYPE_ARRAY;
-	map.key_size = sizeof(int);
-	map.value_size = sizeof(unsigned long long);
-	map.max_entries = 1;
-
-	mfd = bpf_create_map_xattr(&map);
+	mfd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(__u64), 1, NULL);
 	if (mfd < 0)
 		return -1;
 
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 8b31bc1a801d..f4cd658bbe00 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -33,15 +33,14 @@
 
 static int skips;
 
-static int map_flags;
+static struct bpf_map_create_opts map_opts = { .sz = sizeof(map_opts) };
 
 static void test_hashmap(unsigned int task, void *data)
 {
 	long long key, next_key, first_key, value;
 	int fd;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    2, map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), 2, &map_opts);
 	if (fd < 0) {
 		printf("Failed to create hashmap '%s'!\n", strerror(errno));
 		exit(1);
@@ -138,8 +137,7 @@ static void test_hashmap_sizes(unsigned int task, void *data)
 
 	for (i = 1; i <= 512; i <<= 1)
 		for (j = 1; j <= 1 << 18; j <<= 1) {
-			fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j,
-					    2, map_flags);
+			fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, i, j, 2, &map_opts);
 			if (fd < 0) {
 				if (errno == ENOMEM)
 					return;
@@ -160,8 +158,8 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	int expected_key_mask = 0;
 	int fd, i;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
-			    sizeof(bpf_percpu(value, 0)), 2, map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_HASH, NULL, sizeof(key),
+			    sizeof(bpf_percpu(value, 0)), 2, &map_opts);
 	if (fd < 0) {
 		printf("Failed to create hashmap '%s'!\n", strerror(errno));
 		exit(1);
@@ -272,11 +270,11 @@ static int helper_fill_hashmap(int max_entries)
 	int i, fd, ret;
 	long long key, value;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    max_entries, map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
+			    max_entries, &map_opts);
 	CHECK(fd < 0,
 	      "failed to create hashmap",
-	      "err: %s, flags: 0x%x\n", strerror(errno), map_flags);
+	      "err: %s, flags: 0x%x\n", strerror(errno), map_opts.map_flags);
 
 	for (i = 0; i < max_entries; i++) {
 		key = i; value = key;
@@ -332,8 +330,8 @@ static void test_hashmap_zero_seed(void)
 	int i, first, second, old_flags;
 	long long key, next_first, next_second;
 
-	old_flags = map_flags;
-	map_flags |= BPF_F_ZERO_SEED;
+	old_flags = map_opts.map_flags;
+	map_opts.map_flags |= BPF_F_ZERO_SEED;
 
 	first = helper_fill_hashmap(3);
 	second = helper_fill_hashmap(3);
@@ -355,7 +353,7 @@ static void test_hashmap_zero_seed(void)
 		key = next_first;
 	}
 
-	map_flags = old_flags;
+	map_opts.map_flags = old_flags;
 	close(first);
 	close(second);
 }
@@ -365,8 +363,7 @@ static void test_arraymap(unsigned int task, void *data)
 	int key, next_key, fd;
 	long long value;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
-			    2, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value), 2, NULL);
 	if (fd < 0) {
 		printf("Failed to create arraymap '%s'!\n", strerror(errno));
 		exit(1);
@@ -421,8 +418,8 @@ static void test_arraymap_percpu(unsigned int task, void *data)
 	BPF_DECLARE_PERCPU(long, values);
 	int key, next_key, fd, i;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
-			    sizeof(bpf_percpu(values, 0)), 2, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, NULL, sizeof(key),
+			    sizeof(bpf_percpu(values, 0)), 2, NULL);
 	if (fd < 0) {
 		printf("Failed to create arraymap '%s'!\n", strerror(errno));
 		exit(1);
@@ -484,8 +481,8 @@ static void test_arraymap_percpu_many_keys(void)
 	unsigned int nr_keys = 2000;
 	int key, fd, i;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
-			    sizeof(bpf_percpu(values, 0)), nr_keys, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, NULL, sizeof(key),
+			    sizeof(bpf_percpu(values, 0)), nr_keys, NULL);
 	if (fd < 0) {
 		printf("Failed to create per-cpu arraymap '%s'!\n",
 		       strerror(errno));
@@ -516,8 +513,7 @@ static void test_devmap(unsigned int task, void *data)
 	int fd;
 	__u32 key, value;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value),
-			    2, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, NULL, sizeof(key), sizeof(value), 2, NULL);
 	if (fd < 0) {
 		printf("Failed to create devmap '%s'!\n", strerror(errno));
 		exit(1);
@@ -531,8 +527,7 @@ static void test_devmap_hash(unsigned int task, void *data)
 	int fd;
 	__u32 key, value;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH, sizeof(key), sizeof(value),
-			    2, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP_HASH, NULL, sizeof(key), sizeof(value), 2, NULL);
 	if (fd < 0) {
 		printf("Failed to create devmap_hash '%s'!\n", strerror(errno));
 		exit(1);
@@ -552,14 +547,12 @@ static void test_queuemap(unsigned int task, void *data)
 		vals[i] = rand();
 
 	/* Invalid key size */
-	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 4, sizeof(val), MAP_SIZE,
-			    map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 4, sizeof(val), MAP_SIZE, &map_opts);
 	assert(fd < 0 && errno == EINVAL);
 
-	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(val), MAP_SIZE,
-			    map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, sizeof(val), MAP_SIZE, &map_opts);
 	/* Queue map does not support BPF_F_NO_PREALLOC */
-	if (map_flags & BPF_F_NO_PREALLOC) {
+	if (map_opts.map_flags & BPF_F_NO_PREALLOC) {
 		assert(fd < 0 && errno == EINVAL);
 		return;
 	}
@@ -610,14 +603,12 @@ static void test_stackmap(unsigned int task, void *data)
 		vals[i] = rand();
 
 	/* Invalid key size */
-	fd = bpf_create_map(BPF_MAP_TYPE_STACK, 4, sizeof(val), MAP_SIZE,
-			    map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_STACK, NULL, 4, sizeof(val), MAP_SIZE, &map_opts);
 	assert(fd < 0 && errno == EINVAL);
 
-	fd = bpf_create_map(BPF_MAP_TYPE_STACK, 0, sizeof(val), MAP_SIZE,
-			    map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_STACK, NULL, 0, sizeof(val), MAP_SIZE, &map_opts);
 	/* Stack map does not support BPF_F_NO_PREALLOC */
-	if (map_flags & BPF_F_NO_PREALLOC) {
+	if (map_opts.map_flags & BPF_F_NO_PREALLOC) {
 		assert(fd < 0 && errno == EINVAL);
 		return;
 	}
@@ -744,9 +735,9 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 
 	/* Test sockmap with connected sockets */
-	fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
+	fd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL,
 			    sizeof(key), sizeof(value),
-			    6, 0);
+			    6, NULL);
 	if (fd < 0) {
 		if (!bpf_probe_map_type(BPF_MAP_TYPE_SOCKMAP, 0)) {
 			printf("%s SKIP (unsupported map type BPF_MAP_TYPE_SOCKMAP)\n",
@@ -1168,8 +1159,7 @@ static void test_map_in_map(void)
 
 	obj = bpf_object__open(MAPINMAP_PROG);
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int), sizeof(int),
-			    2, 0);
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int), sizeof(int), 2, NULL);
 	if (fd < 0) {
 		printf("Failed to create hashmap '%s'!\n", strerror(errno));
 		exit(1);
@@ -1315,8 +1305,8 @@ static void test_map_large(void)
 	} key;
 	int fd, i, value;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    MAP_SIZE, map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
+			    MAP_SIZE, &map_opts);
 	if (fd < 0) {
 		printf("Failed to create large map '%s'!\n", strerror(errno));
 		exit(1);
@@ -1469,8 +1459,8 @@ static void test_map_parallel(void)
 	int i, fd, key = 0, value = 0;
 	int data[2];
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    MAP_SIZE, map_flags);
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
+			    MAP_SIZE, &map_opts);
 	if (fd < 0) {
 		printf("Failed to create map for parallel test '%s'!\n",
 		       strerror(errno));
@@ -1518,9 +1508,13 @@ static void test_map_parallel(void)
 static void test_map_rdonly(void)
 {
 	int fd, key = 0, value = 0;
+	__u32 old_flags;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    MAP_SIZE, map_flags | BPF_F_RDONLY);
+	old_flags = map_opts.map_flags;
+	map_opts.map_flags |= BPF_F_RDONLY;
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
+			    MAP_SIZE, &map_opts);
+	map_opts.map_flags = old_flags;
 	if (fd < 0) {
 		printf("Failed to create map for read only test '%s'!\n",
 		       strerror(errno));
@@ -1543,9 +1537,13 @@ static void test_map_rdonly(void)
 static void test_map_wronly_hash(void)
 {
 	int fd, key = 0, value = 0;
+	__u32 old_flags;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
-			    MAP_SIZE, map_flags | BPF_F_WRONLY);
+	old_flags = map_opts.map_flags;
+	map_opts.map_flags |= BPF_F_WRONLY;
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value),
+			    MAP_SIZE, &map_opts);
+	map_opts.map_flags = old_flags;
 	if (fd < 0) {
 		printf("Failed to create map for write only test '%s'!\n",
 		       strerror(errno));
@@ -1567,13 +1565,17 @@ static void test_map_wronly_hash(void)
 static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type)
 {
 	int fd, value = 0;
+	__u32 old_flags;
+
 
 	assert(map_type == BPF_MAP_TYPE_QUEUE ||
 	       map_type == BPF_MAP_TYPE_STACK);
-	fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE,
-			    map_flags | BPF_F_WRONLY);
+	old_flags = map_opts.map_flags;
+	map_opts.map_flags |= BPF_F_WRONLY;
+	fd = bpf_map_create(map_type, NULL, 0, sizeof(value), MAP_SIZE, &map_opts);
+	map_opts.map_flags = old_flags;
 	/* Stack/Queue maps do not support BPF_F_NO_PREALLOC */
-	if (map_flags & BPF_F_NO_PREALLOC) {
+	if (map_opts.map_flags & BPF_F_NO_PREALLOC) {
 		assert(fd < 0 && errno == EINVAL);
 		return;
 	}
@@ -1700,8 +1702,8 @@ static void test_reuseport_array(void)
 	__u32 fds_idx = 0;
 	int fd;
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
-				sizeof(__u32), sizeof(__u64), array_size, 0);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, NULL,
+				sizeof(__u32), sizeof(__u64), array_size, NULL);
 	CHECK(map_fd < 0, "reuseport array create",
 	      "map_fd:%d, errno:%d\n", map_fd, errno);
 
@@ -1837,8 +1839,8 @@ static void test_reuseport_array(void)
 	close(map_fd);
 
 	/* Test 32 bit fd */
-	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
-				sizeof(__u32), sizeof(__u32), array_size, 0);
+	map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, NULL,
+				sizeof(__u32), sizeof(__u32), array_size, NULL);
 	CHECK(map_fd < 0, "reuseport array create",
 	      "map_fd:%d, errno:%d\n", map_fd, errno);
 	prepare_reuseport_grp(SOCK_STREAM, map_fd, sizeof(__u32), &fd64,
@@ -1896,10 +1898,10 @@ int main(void)
 
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
 
-	map_flags = 0;
+	map_opts.map_flags = 0;
 	run_all_tests();
 
-	map_flags = BPF_F_NO_PREALLOC;
+	map_opts.map_flags = BPF_F_NO_PREALLOC;
 	run_all_tests();
 
 #define DEFINE_TEST(name) test_##name();
diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c
index 5c7bea525626..0851c42ee31c 100644
--- a/tools/testing/selftests/bpf/test_tag.c
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -185,11 +185,12 @@ static void do_test(uint32_t *tests, int start_insns, int fd_map,
 
 int main(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
 	uint32_t tests = 0;
 	int i, fd_map;
 
-	fd_map = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int),
-				sizeof(int), 1, BPF_F_NO_PREALLOC);
+	fd_map = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int),
+				sizeof(int), 1, &opts);
 	assert(fd_map > 0);
 
 	for (i = 0; i < 5; i++) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index e512b715a785..222cb063ddf4 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -461,11 +461,11 @@ static int __create_map(uint32_t type, uint32_t size_key,
 			uint32_t size_value, uint32_t max_elem,
 			uint32_t extra_flags)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	int fd;
 
-	fd = bpf_create_map(type, size_key, size_value, max_elem,
-			    (type == BPF_MAP_TYPE_HASH ?
-			     BPF_F_NO_PREALLOC : 0) | extra_flags);
+	opts.map_flags = (type == BPF_MAP_TYPE_HASH ? BPF_F_NO_PREALLOC : 0) | extra_flags;
+	fd = bpf_map_create(type, NULL, size_key, size_value, max_elem, &opts);
 	if (fd < 0) {
 		if (skip_unsupported_map(type))
 			return -1;
@@ -521,8 +521,8 @@ static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem,
 {
 	int mfd, p1fd, p2fd, p3fd;
 
-	mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int),
-			     sizeof(int), max_elem, 0);
+	mfd = bpf_map_create(BPF_MAP_TYPE_PROG_ARRAY, NULL, sizeof(int),
+			     sizeof(int), max_elem, NULL);
 	if (mfd < 0) {
 		if (skip_unsupported_map(BPF_MAP_TYPE_PROG_ARRAY))
 			return -1;
@@ -552,10 +552,11 @@ err:
 
 static int create_map_in_map(void)
 {
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
 	int inner_map_fd, outer_map_fd;
 
-	inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
-				      sizeof(int), 1, 0);
+	inner_map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int),
+				      sizeof(int), 1, NULL);
 	if (inner_map_fd < 0) {
 		if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY))
 			return -1;
@@ -563,8 +564,9 @@ static int create_map_in_map(void)
 		return inner_map_fd;
 	}
 
-	outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL,
-					     sizeof(int), inner_map_fd, 1, 0);
+	opts.inner_map_fd = inner_map_fd;
+	outer_map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL,
+				      sizeof(int), sizeof(int), 1, &opts);
 	if (outer_map_fd < 0) {
 		if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY_OF_MAPS))
 			return -1;
@@ -583,8 +585,8 @@ static int create_cgroup_storage(bool percpu)
 		BPF_MAP_TYPE_CGROUP_STORAGE;
 	int fd;
 
-	fd = bpf_create_map(type, sizeof(struct bpf_cgroup_storage_key),
-			    TEST_DATA_LEN, 0, 0);
+	fd = bpf_map_create(type, NULL, sizeof(struct bpf_cgroup_storage_key),
+			    TEST_DATA_LEN, 0, NULL);
 	if (fd < 0) {
 		if (skip_unsupported_map(type))
 			return -1;
@@ -648,22 +650,17 @@ static int load_btf(void)
 
 static int create_map_spin_lock(void)
 {
-	struct bpf_create_map_attr attr = {
-		.name = "test_map",
-		.map_type = BPF_MAP_TYPE_ARRAY,
-		.key_size = 4,
-		.value_size = 8,
-		.max_entries = 1,
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
 		.btf_key_type_id = 1,
 		.btf_value_type_id = 3,
-	};
+	);
 	int fd, btf_fd;
 
 	btf_fd = load_btf();
 	if (btf_fd < 0)
 		return -1;
-	attr.btf_fd = btf_fd;
-	fd = bpf_create_map_xattr(&attr);
+	opts.btf_fd = btf_fd;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "test_map", 4, 8, 1, &opts);
 	if (fd < 0)
 		printf("Failed to create map with spin_lock\n");
 	return fd;
@@ -671,24 +668,19 @@ static int create_map_spin_lock(void)
 
 static int create_sk_storage_map(void)
 {
-	struct bpf_create_map_attr attr = {
-		.name = "test_map",
-		.map_type = BPF_MAP_TYPE_SK_STORAGE,
-		.key_size = 4,
-		.value_size = 8,
-		.max_entries = 0,
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
 		.map_flags = BPF_F_NO_PREALLOC,
 		.btf_key_type_id = 1,
 		.btf_value_type_id = 3,
-	};
+	);
 	int fd, btf_fd;
 
 	btf_fd = load_btf();
 	if (btf_fd < 0)
 		return -1;
-	attr.btf_fd = btf_fd;
-	fd = bpf_create_map_xattr(&attr);
-	close(attr.btf_fd);
+	opts.btf_fd = btf_fd;
+	fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "test_map", 4, 8, 0, &opts);
+	close(opts.btf_fd);
 	if (fd < 0)
 		printf("Failed to create sk_storage_map\n");
 	return fd;
-- 
cgit v1.2.3


From 1144ab9bdf3430e1b5b3f22741e5283841951add Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:13 -0800
Subject: tools/resolve_btf_ids: Close ELF file on error

Fix one case where we don't do explicit clean up.

Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-2-andrii@kernel.org
---
 tools/bpf/resolve_btfids/main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index a59cb0ee609c..e9e6166c3f28 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -167,7 +167,7 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name)
 	return NULL;
 }
 
-static struct btf_id*
+static struct btf_id *
 btf_id__add(struct rb_root *root, char *name, bool unique)
 {
 	struct rb_node **p = &root->rb_node;
@@ -730,7 +730,8 @@ int main(int argc, const char **argv)
 	if (obj.efile.idlist_shndx == -1 ||
 	    obj.efile.symbols_shndx == -1) {
 		pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n");
-		return 0;
+		err = 0;
+		goto out;
 	}
 
 	if (symbols_collect(&obj))
-- 
cgit v1.2.3


From 401891a9debaf0a684502f2aaecf53448cee9414 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:14 -0800
Subject: libbpf: Fix potential misaligned memory access in btf_ext__new()

Perform a memory copy before we do the sanity checks of btf_ext_hdr.
This prevents misaligned memory access if raw btf_ext data is not 4-byte
aligned ([0]).

While at it, also add missing const qualifier.

  [0] Closes: https://github.com/libbpf/libbpf/issues/391

Fixes: 2993e0515bb4 ("tools/bpf: add support to read .BTF.ext sections")
Reported-by: Evgeny Vereshchagin <evvers@ya.ru>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-3-andrii@kernel.org
---
 tools/lib/bpf/btf.c | 10 +++++-----
 tools/lib/bpf/btf.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index e97217a77196..8024fe355ca8 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2731,15 +2731,11 @@ void btf_ext__free(struct btf_ext *btf_ext)
 	free(btf_ext);
 }
 
-struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
+struct btf_ext *btf_ext__new(const __u8 *data, __u32 size)
 {
 	struct btf_ext *btf_ext;
 	int err;
 
-	err = btf_ext_parse_hdr(data, size);
-	if (err)
-		return libbpf_err_ptr(err);
-
 	btf_ext = calloc(1, sizeof(struct btf_ext));
 	if (!btf_ext)
 		return libbpf_err_ptr(-ENOMEM);
@@ -2752,6 +2748,10 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size)
 	}
 	memcpy(btf_ext->data, data, size);
 
+	err = btf_ext_parse_hdr(btf_ext->data, size);
+	if (err)
+		goto done;
+
 	if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) {
 		err = -EINVAL;
 		goto done;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 5c73a5b0a044..742a2bf71c5e 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -157,7 +157,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 				    __u32 expected_value_size,
 				    __u32 *key_type_id, __u32 *value_type_id);
 
-LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size);
+LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size);
 LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext);
 LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext,
 					     __u32 *size);
-- 
cgit v1.2.3


From 2a6a9bf26170b4e156c18706cd230934ebd2f95f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:16 -0800
Subject: libbpf: Don't call libc APIs with NULL pointers

Sanitizer complains about qsort(), bsearch(), and memcpy() being called
with NULL pointer. This can only happen when the associated number of
elements is zero, so no harm should be done. But still prevent this from
happening to keep sanitizer runs clean from extra noise.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-5-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e05dd785b347..672671879b21 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3369,7 +3369,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 
 	/* sort BPF programs by section name and in-section instruction offset
 	 * for faster search */
-	qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs);
+	if (obj->nr_programs)
+		qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs);
 
 	return bpf_object__init_btf(obj, btf_data, btf_ext_data);
 }
@@ -5823,6 +5824,8 @@ static int cmp_relo_by_insn_idx(const void *key, const void *elem)
 
 static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx)
 {
+	if (!prog->nr_reloc)
+		return NULL;
 	return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc,
 		       sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx);
 }
@@ -5838,8 +5841,9 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra
 	relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos));
 	if (!relos)
 		return -ENOMEM;
-	memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc,
-	       sizeof(*relos) * subprog->nr_reloc);
+	if (subprog->nr_reloc)
+		memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc,
+		       sizeof(*relos) * subprog->nr_reloc);
 
 	for (i = main_prog->nr_reloc; i < new_cnt; i++)
 		relos[i].insn_idx += subprog->sub_insn_off;
-- 
cgit v1.2.3


From 8cb125566c40b7141d8842c534f0ea5820ee3d5c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:17 -0800
Subject: libbpf: Fix glob_syms memory leak in bpf_linker

glob_syms array wasn't freed on bpf_link__free(). Fix that.

Fixes: a46349227cd8 ("libbpf: Add linker extern resolution support for functions and global variables")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-6-andrii@kernel.org
---
 tools/lib/bpf/linker.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 594b206fa674..3e1b2a15fdc7 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -210,6 +210,7 @@ void bpf_linker__free(struct bpf_linker *linker)
 	}
 	free(linker->secs);
 
+	free(linker->glob_syms);
 	free(linker);
 }
 
-- 
cgit v1.2.3


From 593835377f24ca1bb98008ec1dc3baefe491ad6e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:18 -0800
Subject: libbpf: Fix using invalidated memory in bpf_linker

add_dst_sec() can invalidate bpf_linker's section index making
dst_symtab pointer pointing into unallocated memory. Reinitialize
dst_symtab pointer on each iteration to make sure it's always valid.

Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-7-andrii@kernel.org
---
 tools/lib/bpf/linker.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 3e1b2a15fdc7..9aa016fb55aa 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -2000,7 +2000,7 @@ add_sym:
 static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj)
 {
 	struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx];
-	struct dst_sec *dst_symtab = &linker->secs[linker->symtab_sec_idx];
+	struct dst_sec *dst_symtab;
 	int i, err;
 
 	for (i = 1; i < obj->sec_cnt; i++) {
@@ -2033,6 +2033,9 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob
 			return -1;
 		}
 
+		/* add_dst_sec() above could have invalidated linker->secs */
+		dst_symtab = &linker->secs[linker->symtab_sec_idx];
+
 		/* shdr->sh_link points to SYMTAB */
 		dst_sec->shdr->sh_link = linker->symtab_sec_idx;
 
-- 
cgit v1.2.3


From 486e648cb2f170702fc05f777c7b6b3d8ec662ce Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:19 -0800
Subject: selftests/bpf: Fix UBSan complaint about signed __int128 overflow

Test is using __int128 variable as unsigned and highest order bit can be
set to 1 after bit shift. Use unsigned __int128 explicitly and prevent
UBSan from complaining.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-8-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index af47aeb211e7..9e26903f9170 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -323,7 +323,7 @@ static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d,
 				   char *str)
 {
 #ifdef __SIZEOF_INT128__
-	__int128 i = 0xffffffffffffffff;
+	unsigned __int128 i = 0xffffffffffffffff;
 
 	/* this dance is required because we cannot directly initialize
 	 * a 128-bit value to anything larger than a 64-bit value.
-- 
cgit v1.2.3


From 3bd0233f388e061c44d36a1ac614a3bb4a851b7e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:20 -0800
Subject: selftests/bpf: Fix possible NULL passed to memcpy() with zero size

Prevent sanitizer from complaining about passing NULL into memcpy(),
even if it happens with zero size.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-9-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/core_reloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 1041d0c593f6..44a9868c70ea 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -881,7 +881,8 @@ void test_core_reloc(void)
 		data = mmap_data;
 
 		memset(mmap_data, 0, sizeof(*data));
-		memcpy(data->in, test_case->input, test_case->input_len);
+		if (test_case->input_len)
+			memcpy(data->in, test_case->input, test_case->input_len);
 		data->my_pid_tgid = my_pid_tgid;
 
 		link = bpf_program__attach_raw_tracepoint(prog, tp_name);
-- 
cgit v1.2.3


From 6c4dedb7550aafd094f7d803668fd039545f4e57 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:21 -0800
Subject: selftests/bpf: Prevent misaligned memory access in get_stack_raw_tp
 test

Perfbuf doesn't guarantee 8-byte alignment of the data like BPF ringbuf
does, so struct get_stack_trace_t can arrive not properly aligned for
subsequent u64 accesses. Easiest fix is to just copy data locally.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-10-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 4184c399d4c6..977ab433a946 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -24,13 +24,19 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size)
 {
 	bool good_kern_stack = false, good_user_stack = false;
 	const char *nonjit_func = "___bpf_prog_run";
-	struct get_stack_trace_t *e = data;
+	/* perfbuf-submitted data is 4-byte aligned, but we need 8-byte
+	 * alignment, so copy data into a local variable, for simplicity
+	 */
+	struct get_stack_trace_t e;
 	int i, num_stack;
 	static __u64 cnt;
 	struct ksym *ks;
 
 	cnt++;
 
+	memset(&e, 0, sizeof(e));
+	memcpy(&e, data, size <= sizeof(e) ? size : sizeof(e));
+
 	if (size < sizeof(struct get_stack_trace_t)) {
 		__u64 *raw_data = data;
 		bool found = false;
@@ -57,19 +63,19 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size)
 			good_user_stack = true;
 		}
 	} else {
-		num_stack = e->kern_stack_size / sizeof(__u64);
+		num_stack = e.kern_stack_size / sizeof(__u64);
 		if (env.jit_enabled) {
 			good_kern_stack = num_stack > 0;
 		} else {
 			for (i = 0; i < num_stack; i++) {
-				ks = ksym_search(e->kern_stack[i]);
+				ks = ksym_search(e.kern_stack[i]);
 				if (ks && (strcmp(ks->name, nonjit_func) == 0)) {
 					good_kern_stack = true;
 					break;
 				}
 			}
 		}
-		if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0)
+		if (e.user_stack_size > 0 && e.user_stack_buildid_size > 0)
 			good_user_stack = true;
 	}
 
-- 
cgit v1.2.3


From e2e0d90c550a2588ebed7aa2753adaac0f633989 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:22 -0800
Subject: selftests/bpf: Fix misaligned memory access in queue_stack_map test

Copy over iphdr into a local variable before accessing its fields.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-11-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/queue_stack_map.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
index 8ccba3ab70ee..b9822f914eeb 100644
--- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
@@ -14,7 +14,7 @@ static void test_queue_stack_map_by_type(int type)
 	int i, err, prog_fd, map_in_fd, map_out_fd;
 	char file[32], buf[128];
 	struct bpf_object *obj;
-	struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+	struct iphdr iph;
 
 	/* Fill test values to be used */
 	for (i = 0; i < MAP_SIZE; i++)
@@ -60,15 +60,17 @@ static void test_queue_stack_map_by_type(int type)
 
 		err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
 					buf, &size, &retval, &duration);
-		if (err || retval || size != sizeof(pkt_v4) ||
-		    iph->daddr != val)
+		if (err || retval || size != sizeof(pkt_v4))
+			break;
+		memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
+		if (iph.daddr != val)
 			break;
 	}
 
-	CHECK(err || retval || size != sizeof(pkt_v4) || iph->daddr != val,
+	CHECK(err || retval || size != sizeof(pkt_v4) || iph.daddr != val,
 	      "bpf_map_pop_elem",
 	      "err %d errno %d retval %d size %d iph->daddr %u\n",
-	      err, errno, retval, size, iph->daddr);
+	      err, errno, retval, size, iph.daddr);
 
 	/* Queue is empty, program should return TC_ACT_SHOT */
 	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
-- 
cgit v1.2.3


From 57428298b5acf2ba2dd98359c532774f6eaeecb3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:23 -0800
Subject: selftests/bpf: Prevent out-of-bounds stack access in test_bpffs

Buf can be not zero-terminated leading to strstr() to access data beyond
the intended buf[] array. Fix by forcing zero termination.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-12-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/test_bpffs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
index ada95bfb9b1b..214d9f4a94a5 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
@@ -19,11 +19,13 @@ static int read_iter(char *file)
 	fd = open(file, 0);
 	if (fd < 0)
 		return -1;
-	while ((len = read(fd, buf, sizeof(buf))) > 0)
+	while ((len = read(fd, buf, sizeof(buf))) > 0) {
+		buf[sizeof(buf) - 1] = '\0';
 		if (strstr(buf, "iter")) {
 			close(fd);
 			return 0;
 		}
+	}
 	close(fd);
 	return -1;
 }
-- 
cgit v1.2.3


From 43080b7106db5bcdb4f09c2648e968151e1461b7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:24 -0800
Subject: selftests/bpf: Fix misaligned memory accesses in xdp_bonding test

Construct packet buffer explicitly for each packet to avoid unaligned
memory accesses.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-13-andrii@kernel.org
---
 .../testing/selftests/bpf/prog_tests/xdp_bonding.c | 36 ++++++++++++----------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c
index faa22b84f2ee..5e3a26b15ec6 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c
@@ -218,9 +218,9 @@ static int send_udp_packets(int vary_dst_ip)
 		.h_dest = BOND2_MAC,
 		.h_proto = htons(ETH_P_IP),
 	};
-	uint8_t buf[128] = {};
-	struct iphdr *iph = (struct iphdr *)(buf + sizeof(eh));
-	struct udphdr *uh = (struct udphdr *)(buf + sizeof(eh) + sizeof(*iph));
+	struct iphdr iph = {};
+	struct udphdr uh = {};
+	uint8_t buf[128];
 	int i, s = -1;
 	int ifindex;
 
@@ -232,17 +232,16 @@ static int send_udp_packets(int vary_dst_ip)
 	if (!ASSERT_GT(ifindex, 0, "get bond1 ifindex"))
 		goto err;
 
-	memcpy(buf, &eh, sizeof(eh));
-	iph->ihl = 5;
-	iph->version = 4;
-	iph->tos = 16;
-	iph->id = 1;
-	iph->ttl = 64;
-	iph->protocol = IPPROTO_UDP;
-	iph->saddr = 1;
-	iph->daddr = 2;
-	iph->tot_len = htons(sizeof(buf) - ETH_HLEN);
-	iph->check = 0;
+	iph.ihl = 5;
+	iph.version = 4;
+	iph.tos = 16;
+	iph.id = 1;
+	iph.ttl = 64;
+	iph.protocol = IPPROTO_UDP;
+	iph.saddr = 1;
+	iph.daddr = 2;
+	iph.tot_len = htons(sizeof(buf) - ETH_HLEN);
+	iph.check = 0;
 
 	for (i = 1; i <= NPACKETS; i++) {
 		int n;
@@ -253,10 +252,15 @@ static int send_udp_packets(int vary_dst_ip)
 		};
 
 		/* vary the UDP destination port for even distribution with roundrobin/xor modes */
-		uh->dest++;
+		uh.dest++;
 
 		if (vary_dst_ip)
-			iph->daddr++;
+			iph.daddr++;
+
+		/* construct a packet */
+		memcpy(buf, &eh, sizeof(eh));
+		memcpy(buf + sizeof(eh), &iph, sizeof(iph));
+		memcpy(buf + sizeof(eh) + sizeof(iph), &uh, sizeof(uh));
 
 		n = sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&saddr_ll, sizeof(saddr_ll));
 		if (!ASSERT_EQ(n, sizeof(buf), "sendto"))
-- 
cgit v1.2.3


From 8f6f41f39348f25db843f2fcb2f1c166b4bfa2d7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Nov 2021 16:23:25 -0800
Subject: selftests/bpf: Fix misaligned accesses in xdp and xdp_bpf2bpf tests

Similar to previous patch, just copy over necessary struct into local
stack variable before checking its fields.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211124002325.1737739-14-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/xdp.c         | 11 ++++++-----
 tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c |  6 +++---
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c
index 7a7ef9d4e151..ac65456b7ab8 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp.c
@@ -11,8 +11,8 @@ void test_xdp(void)
 	const char *file = "./test_xdp.o";
 	struct bpf_object *obj;
 	char buf[128];
-	struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr);
-	struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+	struct ipv6hdr iph6;
+	struct iphdr iph;
 	__u32 duration, retval, size;
 	int err, prog_fd, map_fd;
 
@@ -28,16 +28,17 @@ void test_xdp(void)
 
 	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
 				buf, &size, &retval, &duration);
-
+	memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
 	CHECK(err || retval != XDP_TX || size != 74 ||
-	      iph->protocol != IPPROTO_IPIP, "ipv4",
+	      iph.protocol != IPPROTO_IPIP, "ipv4",
 	      "err %d errno %d retval %d size %d\n",
 	      err, errno, retval, size);
 
 	err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
 				buf, &size, &retval, &duration);
+	memcpy(&iph6, buf + sizeof(struct ethhdr), sizeof(iph6));
 	CHECK(err || retval != XDP_TX || size != 114 ||
-	      iph6->nexthdr != IPPROTO_IPV6, "ipv6",
+	      iph6.nexthdr != IPPROTO_IPV6, "ipv6",
 	      "err %d errno %d retval %d size %d\n",
 	      err, errno, retval, size);
 out:
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
index f99386d1dc4c..c98a897ad692 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
@@ -42,7 +42,7 @@ void test_xdp_bpf2bpf(void)
 	char buf[128];
 	int err, pkt_fd, map_fd;
 	bool passed = false;
-	struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+	struct iphdr iph;
 	struct iptnl_info value4 = {.family = AF_INET};
 	struct test_xdp *pkt_skel = NULL;
 	struct test_xdp_bpf2bpf *ftrace_skel = NULL;
@@ -93,9 +93,9 @@ void test_xdp_bpf2bpf(void)
 	/* Run test program */
 	err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4),
 				buf, &size, &retval, &duration);
-
+	memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph));
 	if (CHECK(err || retval != XDP_TX || size != 74 ||
-		  iph->protocol != IPPROTO_IPIP, "ipv4",
+		  iph.protocol != IPPROTO_IPIP, "ipv4",
 		  "err %d errno %d retval %d size %d\n",
 		  err, errno, retval, size))
 		goto out;
-- 
cgit v1.2.3


From a125f91fe783a79c9d990fe6e28a3da8d9a0857f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:49 -0800
Subject: selftests: tls: add helper for creating sock pairs

We have the same code 3 times, about to add a fourth copy.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 128 +++++++++-----------------------------
 1 file changed, 29 insertions(+), 99 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index e61fc4c32ba2..8fb7cf8c4bfb 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -78,26 +78,21 @@ static void memrnd(void *s, size_t n)
 		*byte++ = rand();
 }
 
-FIXTURE(tls_basic)
-{
-	int fd, cfd;
-	bool notls;
-};
-
-FIXTURE_SETUP(tls_basic)
+static void ulp_sock_pair(struct __test_metadata *_metadata,
+			  int *fd, int *cfd, bool *notls)
 {
 	struct sockaddr_in addr;
 	socklen_t len;
 	int sfd, ret;
 
-	self->notls = false;
+	*notls = false;
 	len = sizeof(addr);
 
 	addr.sin_family = AF_INET;
 	addr.sin_addr.s_addr = htonl(INADDR_ANY);
 	addr.sin_port = 0;
 
-	self->fd = socket(AF_INET, SOCK_STREAM, 0);
+	*fd = socket(AF_INET, SOCK_STREAM, 0);
 	sfd = socket(AF_INET, SOCK_STREAM, 0);
 
 	ret = bind(sfd, &addr, sizeof(addr));
@@ -108,26 +103,37 @@ FIXTURE_SETUP(tls_basic)
 	ret = getsockname(sfd, &addr, &len);
 	ASSERT_EQ(ret, 0);
 
-	ret = connect(self->fd, &addr, sizeof(addr));
+	ret = connect(*fd, &addr, sizeof(addr));
 	ASSERT_EQ(ret, 0);
 
-	self->cfd = accept(sfd, &addr, &len);
-	ASSERT_GE(self->cfd, 0);
+	*cfd = accept(sfd, &addr, &len);
+	ASSERT_GE(*cfd, 0);
 
 	close(sfd);
 
-	ret = setsockopt(self->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	ret = setsockopt(*fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
 	if (ret != 0) {
 		ASSERT_EQ(errno, ENOENT);
-		self->notls = true;
+		*notls = true;
 		printf("Failure setting TCP_ULP, testing without tls\n");
 		return;
 	}
 
-	ret = setsockopt(self->cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	ret = setsockopt(*cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
 	ASSERT_EQ(ret, 0);
 }
 
+FIXTURE(tls_basic)
+{
+	int fd, cfd;
+	bool notls;
+};
+
+FIXTURE_SETUP(tls_basic)
+{
+	ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
+}
+
 FIXTURE_TEARDOWN(tls_basic)
 {
 	close(self->fd);
@@ -199,60 +205,21 @@ FIXTURE_VARIANT_ADD(tls, 13_sm4_ccm)
 FIXTURE_SETUP(tls)
 {
 	struct tls_crypto_info_keys tls12;
-	struct sockaddr_in addr;
-	socklen_t len;
-	int sfd, ret;
-
-	self->notls = false;
-	len = sizeof(addr);
+	int ret;
 
 	tls_crypto_info_init(variant->tls_version, variant->cipher_type,
 			     &tls12);
 
-	addr.sin_family = AF_INET;
-	addr.sin_addr.s_addr = htonl(INADDR_ANY);
-	addr.sin_port = 0;
+	ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
 
-	self->fd = socket(AF_INET, SOCK_STREAM, 0);
-	sfd = socket(AF_INET, SOCK_STREAM, 0);
-
-	ret = bind(sfd, &addr, sizeof(addr));
-	ASSERT_EQ(ret, 0);
-	ret = listen(sfd, 10);
-	ASSERT_EQ(ret, 0);
+	if (self->notls)
+		return;
 
-	ret = getsockname(sfd, &addr, &len);
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len);
 	ASSERT_EQ(ret, 0);
 
-	ret = connect(self->fd, &addr, sizeof(addr));
+	ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len);
 	ASSERT_EQ(ret, 0);
-
-	ret = setsockopt(self->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
-	if (ret != 0) {
-		self->notls = true;
-		printf("Failure setting TCP_ULP, testing without tls\n");
-	}
-
-	if (!self->notls) {
-		ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12,
-				 tls12.len);
-		ASSERT_EQ(ret, 0);
-	}
-
-	self->cfd = accept(sfd, &addr, &len);
-	ASSERT_GE(self->cfd, 0);
-
-	if (!self->notls) {
-		ret = setsockopt(self->cfd, IPPROTO_TCP, TCP_ULP, "tls",
-				 sizeof("tls"));
-		ASSERT_EQ(ret, 0);
-
-		ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12,
-				 tls12.len);
-		ASSERT_EQ(ret, 0);
-	}
-
-	close(sfd);
 }
 
 FIXTURE_TEARDOWN(tls)
@@ -1355,62 +1322,25 @@ TEST(non_established) {
 
 TEST(keysizes) {
 	struct tls12_crypto_info_aes_gcm_256 tls12;
-	struct sockaddr_in addr;
-	int sfd, ret, fd, cfd;
-	socklen_t len;
+	int ret, fd, cfd;
 	bool notls;
 
-	notls = false;
-	len = sizeof(addr);
-
 	memset(&tls12, 0, sizeof(tls12));
 	tls12.info.version = TLS_1_2_VERSION;
 	tls12.info.cipher_type = TLS_CIPHER_AES_GCM_256;
 
-	addr.sin_family = AF_INET;
-	addr.sin_addr.s_addr = htonl(INADDR_ANY);
-	addr.sin_port = 0;
-
-	fd = socket(AF_INET, SOCK_STREAM, 0);
-	sfd = socket(AF_INET, SOCK_STREAM, 0);
-
-	ret = bind(sfd, &addr, sizeof(addr));
-	ASSERT_EQ(ret, 0);
-	ret = listen(sfd, 10);
-	ASSERT_EQ(ret, 0);
-
-	ret = getsockname(sfd, &addr, &len);
-	ASSERT_EQ(ret, 0);
-
-	ret = connect(fd, &addr, sizeof(addr));
-	ASSERT_EQ(ret, 0);
-
-	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
-	if (ret != 0) {
-		notls = true;
-		printf("Failure setting TCP_ULP, testing without tls\n");
-	}
+	ulp_sock_pair(_metadata, &fd, &cfd, &notls);
 
 	if (!notls) {
 		ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12,
 				 sizeof(tls12));
 		EXPECT_EQ(ret, 0);
-	}
-
-	cfd = accept(sfd, &addr, &len);
-	ASSERT_GE(cfd, 0);
-
-	if (!notls) {
-		ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls",
-				 sizeof("tls"));
-		EXPECT_EQ(ret, 0);
 
 		ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12,
 				 sizeof(tls12));
 		EXPECT_EQ(ret, 0);
 	}
 
-	close(sfd);
 	close(fd);
 	close(cfd);
 }
-- 
cgit v1.2.3


From 31180adb0bed7d3e67ae97121dc83f320bbd45be Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:50 -0800
Subject: selftests: tls: factor out cmsg send/receive

Add helpers for sending and receiving special record types.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 111 ++++++++++++++++++++++++--------------
 1 file changed, 70 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 8fb7cf8c4bfb..642d1d629b28 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -123,6 +123,65 @@ static void ulp_sock_pair(struct __test_metadata *_metadata,
 	ASSERT_EQ(ret, 0);
 }
 
+/* Produce a basic cmsg */
+static int tls_send_cmsg(int fd, unsigned char record_type,
+			 void *data, size_t len, int flags)
+{
+	char cbuf[CMSG_SPACE(sizeof(char))];
+	int cmsg_len = sizeof(char);
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+	struct iovec vec;
+
+	vec.iov_base = data;
+	vec.iov_len = len;
+	memset(&msg, 0, sizeof(struct msghdr));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_TLS;
+	/* test sending non-record types. */
+	cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+	cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+	*CMSG_DATA(cmsg) = record_type;
+	msg.msg_controllen = cmsg->cmsg_len;
+
+	return sendmsg(fd, &msg, flags);
+}
+
+static int tls_recv_cmsg(struct __test_metadata *_metadata,
+			 int fd, unsigned char record_type,
+			 void *data, size_t len, int flags)
+{
+	char cbuf[CMSG_SPACE(sizeof(char))];
+	struct cmsghdr *cmsg;
+	unsigned char ctype;
+	struct msghdr msg;
+	struct iovec vec;
+	int n;
+
+	vec.iov_base = data;
+	vec.iov_len = len;
+	memset(&msg, 0, sizeof(struct msghdr));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+
+	n = recvmsg(fd, &msg, flags);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	EXPECT_NE(cmsg, NULL);
+	EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
+	EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
+	ctype = *((unsigned char *)CMSG_DATA(cmsg));
+	EXPECT_EQ(ctype, record_type);
+
+	return n;
+}
+
 FIXTURE(tls_basic)
 {
 	int fd, cfd;
@@ -1160,60 +1219,30 @@ TEST_F(tls, mutliproc_sendpage_writers)
 
 TEST_F(tls, control_msg)
 {
-	if (self->notls)
-		return;
-
-	char cbuf[CMSG_SPACE(sizeof(char))];
-	char const *test_str = "test_read";
-	int cmsg_len = sizeof(char);
+	char *test_str = "test_read";
 	char record_type = 100;
-	struct cmsghdr *cmsg;
-	struct msghdr msg;
 	int send_len = 10;
-	struct iovec vec;
 	char buf[10];
 
-	vec.iov_base = (char *)test_str;
-	vec.iov_len = 10;
-	memset(&msg, 0, sizeof(struct msghdr));
-	msg.msg_iov = &vec;
-	msg.msg_iovlen = 1;
-	msg.msg_control = cbuf;
-	msg.msg_controllen = sizeof(cbuf);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	cmsg->cmsg_level = SOL_TLS;
-	/* test sending non-record types. */
-	cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
-	cmsg->cmsg_len = CMSG_LEN(cmsg_len);
-	*CMSG_DATA(cmsg) = record_type;
-	msg.msg_controllen = cmsg->cmsg_len;
+	if (self->notls)
+		SKIP(return, "no TLS support");
 
-	EXPECT_EQ(sendmsg(self->fd, &msg, 0), send_len);
+	EXPECT_EQ(tls_send_cmsg(self->fd, record_type, test_str, send_len, 0),
+		  send_len);
 	/* Should fail because we didn't provide a control message */
 	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
 
-	vec.iov_base = buf;
-	EXPECT_EQ(recvmsg(self->cfd, &msg, MSG_WAITALL | MSG_PEEK), send_len);
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-	EXPECT_NE(cmsg, NULL);
-	EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
-	EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
-	record_type = *((unsigned char *)CMSG_DATA(cmsg));
-	EXPECT_EQ(record_type, 100);
+	EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, record_type,
+				buf, sizeof(buf), MSG_WAITALL | MSG_PEEK),
+		  send_len);
 	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
 
 	/* Recv the message again without MSG_PEEK */
-	record_type = 0;
 	memset(buf, 0, sizeof(buf));
 
-	EXPECT_EQ(recvmsg(self->cfd, &msg, MSG_WAITALL), send_len);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	EXPECT_NE(cmsg, NULL);
-	EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
-	EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
-	record_type = *((unsigned char *)CMSG_DATA(cmsg));
-	EXPECT_EQ(record_type, 100);
+	EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, record_type,
+				buf, sizeof(buf), MSG_WAITALL),
+		  send_len);
 	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
 }
 
-- 
cgit v1.2.3


From ef0fc0b3cc2b8df7f29656a190a13ceb644b0c00 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:51 -0800
Subject: selftests: tls: add tests for handling of bad records

Test broken records.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 154 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 642d1d629b28..2108b197d3f6 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -1297,6 +1297,160 @@ TEST_F(tls, shutdown_reuse)
 	EXPECT_EQ(errno, EISCONN);
 }
 
+FIXTURE(tls_err)
+{
+	int fd, cfd;
+	int fd2, cfd2;
+	bool notls;
+};
+
+FIXTURE_VARIANT(tls_err)
+{
+	uint16_t tls_version;
+};
+
+FIXTURE_VARIANT_ADD(tls_err, 12_aes_gcm)
+{
+	.tls_version = TLS_1_2_VERSION,
+};
+
+FIXTURE_VARIANT_ADD(tls_err, 13_aes_gcm)
+{
+	.tls_version = TLS_1_3_VERSION,
+};
+
+FIXTURE_SETUP(tls_err)
+{
+	struct tls_crypto_info_keys tls12;
+	int ret;
+
+	tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_128,
+			     &tls12);
+
+	ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
+	ulp_sock_pair(_metadata, &self->fd2, &self->cfd2, &self->notls);
+	if (self->notls)
+		return;
+
+	ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->cfd2, SOL_TLS, TLS_RX, &tls12, tls12.len);
+	ASSERT_EQ(ret, 0);
+}
+
+FIXTURE_TEARDOWN(tls_err)
+{
+	close(self->fd);
+	close(self->cfd);
+	close(self->fd2);
+	close(self->cfd2);
+}
+
+TEST_F(tls_err, bad_rec)
+{
+	char buf[64];
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	memset(buf, 0x55, sizeof(buf));
+	EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf));
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EMSGSIZE);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), MSG_DONTWAIT), -1);
+	EXPECT_EQ(errno, EAGAIN);
+}
+
+TEST_F(tls_err, bad_auth)
+{
+	char buf[128];
+	int n;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	memrnd(buf, sizeof(buf) / 2);
+	EXPECT_EQ(send(self->fd, buf, sizeof(buf) / 2, 0), sizeof(buf) / 2);
+	n = recv(self->cfd, buf, sizeof(buf), 0);
+	EXPECT_GT(n, sizeof(buf) / 2);
+
+	buf[n - 1]++;
+
+	EXPECT_EQ(send(self->fd2, buf, n, 0), n);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+}
+
+TEST_F(tls_err, bad_in_large_read)
+{
+	char txt[3][64];
+	char cip[3][128];
+	char buf[3 * 128];
+	int i, n;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	/* Put 3 records in the sockets */
+	for (i = 0; i < 3; i++) {
+		memrnd(txt[i], sizeof(txt[i]));
+		EXPECT_EQ(send(self->fd, txt[i], sizeof(txt[i]), 0),
+			  sizeof(txt[i]));
+		n = recv(self->cfd, cip[i], sizeof(cip[i]), 0);
+		EXPECT_GT(n, sizeof(txt[i]));
+		/* Break the third message */
+		if (i == 2)
+			cip[2][n - 1]++;
+		EXPECT_EQ(send(self->fd2, cip[i], n, 0), n);
+	}
+
+	/* We should be able to receive the first two messages */
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), sizeof(txt[0]) * 2);
+	EXPECT_EQ(memcmp(buf, txt[0], sizeof(txt[0])), 0);
+	EXPECT_EQ(memcmp(buf + sizeof(txt[0]), txt[1], sizeof(txt[1])), 0);
+	/* Third mesasge is bad */
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+}
+
+TEST_F(tls_err, bad_cmsg)
+{
+	char *test_str = "test_read";
+	int send_len = 10;
+	char cip[128];
+	char buf[128];
+	char txt[64];
+	int n;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	/* Queue up one data record */
+	memrnd(txt, sizeof(txt));
+	EXPECT_EQ(send(self->fd, txt, sizeof(txt), 0), sizeof(txt));
+	n = recv(self->cfd, cip, sizeof(cip), 0);
+	EXPECT_GT(n, sizeof(txt));
+	EXPECT_EQ(send(self->fd2, cip, n, 0), n);
+
+	EXPECT_EQ(tls_send_cmsg(self->fd, 100, test_str, send_len, 0), 10);
+	n = recv(self->cfd, cip, sizeof(cip), 0);
+	cip[n - 1]++; /* Break it */
+	EXPECT_GT(n, send_len);
+	EXPECT_EQ(send(self->fd2, cip, n, 0), n);
+
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), sizeof(txt));
+	EXPECT_EQ(memcmp(buf, txt, sizeof(txt)), 0);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+	EXPECT_EQ(recv(self->cfd2, buf, sizeof(buf), 0), -1);
+	EXPECT_EQ(errno, EBADMSG);
+}
+
 TEST(non_established) {
 	struct tls12_crypto_info_aes_gcm_256 tls12;
 	struct sockaddr_in addr;
-- 
cgit v1.2.3


From d87d67fd61ef4e7619eb24ab20cba3250a6444e8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:53 -0800
Subject: selftests: tls: test splicing cmsgs

Make sure we correctly reject splicing non-data records.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 40 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 2108b197d3f6..3dfa9d7dd4cc 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -639,6 +639,46 @@ TEST_F(tls, splice_to_pipe)
 	EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
 }
 
+TEST_F(tls, splice_cmsg_to_pipe)
+{
+	char *test_str = "test_read";
+	char record_type = 100;
+	int send_len = 10;
+	char buf[10];
+	int p[2];
+
+	ASSERT_GE(pipe(p), 0);
+	EXPECT_EQ(tls_send_cmsg(self->fd, 100, test_str, send_len, 0), 10);
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, send_len, 0), -1);
+	EXPECT_EQ(errno, EINVAL);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
+	EXPECT_EQ(errno, EIO);
+	EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, record_type,
+				buf, sizeof(buf), MSG_WAITALL),
+		  send_len);
+	EXPECT_EQ(memcmp(test_str, buf, send_len), 0);
+}
+
+TEST_F(tls, splice_dec_cmsg_to_pipe)
+{
+	char *test_str = "test_read";
+	char record_type = 100;
+	int send_len = 10;
+	char buf[10];
+	int p[2];
+
+	ASSERT_GE(pipe(p), 0);
+	EXPECT_EQ(tls_send_cmsg(self->fd, 100, test_str, send_len, 0), 10);
+	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
+	EXPECT_EQ(errno, EIO);
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, send_len, 0), -1);
+	EXPECT_EQ(errno, EINVAL);
+	EXPECT_EQ(tls_recv_cmsg(_metadata, self->cfd, record_type,
+				buf, sizeof(buf), MSG_WAITALL),
+		  send_len);
+	EXPECT_EQ(memcmp(test_str, buf, send_len), 0);
+}
+
 TEST_F(tls, recvmsg_single)
 {
 	char const *test_str = "test_recvmsg_single";
-- 
cgit v1.2.3


From 274af0f9e2799da22a866139d9bd203cc0baacff Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:55 -0800
Subject: selftests: tls: test splicing decrypted records

Add tests for half-received and peeked records.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 49 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 3dfa9d7dd4cc..6e78d7207cc1 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -679,6 +679,55 @@ TEST_F(tls, splice_dec_cmsg_to_pipe)
 	EXPECT_EQ(memcmp(test_str, buf, send_len), 0);
 }
 
+TEST_F(tls, recv_and_splice)
+{
+	int send_len = TLS_PAYLOAD_MAX_LEN;
+	char mem_send[TLS_PAYLOAD_MAX_LEN];
+	char mem_recv[TLS_PAYLOAD_MAX_LEN];
+	int half = send_len / 2;
+	int p[2];
+
+	ASSERT_GE(pipe(p), 0);
+	EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len);
+	/* Recv hald of the record, splice the other half */
+	EXPECT_EQ(recv(self->cfd, mem_recv, half, MSG_WAITALL), half);
+	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, half, SPLICE_F_NONBLOCK),
+		  half);
+	EXPECT_EQ(read(p[0], &mem_recv[half], half), half);
+	EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+TEST_F(tls, peek_and_splice)
+{
+	int send_len = TLS_PAYLOAD_MAX_LEN;
+	char mem_send[TLS_PAYLOAD_MAX_LEN];
+	char mem_recv[TLS_PAYLOAD_MAX_LEN];
+	int chunk = TLS_PAYLOAD_MAX_LEN / 4;
+	int n, i, p[2];
+
+	memrnd(mem_send, sizeof(mem_send));
+
+	ASSERT_GE(pipe(p), 0);
+	for (i = 0; i < 4; i++)
+		EXPECT_EQ(send(self->fd, &mem_send[chunk * i], chunk, 0),
+			  chunk);
+
+	EXPECT_EQ(recv(self->cfd, mem_recv, chunk * 5 / 2,
+		       MSG_WAITALL | MSG_PEEK),
+		  chunk * 5 / 2);
+	EXPECT_EQ(memcmp(mem_send, mem_recv, chunk * 5 / 2), 0);
+
+	n = 0;
+	while (n < send_len) {
+		i = splice(self->cfd, NULL, p[1], NULL, send_len - n, 0);
+		EXPECT_GT(i, 0);
+		n += i;
+	}
+	EXPECT_EQ(n, send_len);
+	EXPECT_EQ(read(p[0], mem_recv, send_len), send_len);
+	EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
 TEST_F(tls, recvmsg_single)
 {
 	char const *test_str = "test_recvmsg_single";
-- 
cgit v1.2.3


From f884a3426291b3f368231c0ac0e50a6a061f6aca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 15:25:57 -0800
Subject: selftests: tls: test for correct proto_ops

Previous patch fixes overriding callbacks incorrectly. Triggering
the crash in sendpage_locked would be more spectacular but it's
hard to get to, so take the easier path of proving this is broken
and call getname. We're currently getting IPv4 socket info on an
IPv6 socket.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 55 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 6e78d7207cc1..8a22db0cca49 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -1617,4 +1617,59 @@ TEST(keysizes) {
 	close(cfd);
 }
 
+TEST(tls_v6ops) {
+	struct tls_crypto_info_keys tls12;
+	struct sockaddr_in6 addr, addr2;
+	int sfd, ret, fd;
+	socklen_t len, len2;
+
+	tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12);
+
+	addr.sin6_family = AF_INET6;
+	addr.sin6_addr = in6addr_any;
+	addr.sin6_port = 0;
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	sfd = socket(AF_INET6, SOCK_STREAM, 0);
+
+	ret = bind(sfd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+	ret = listen(sfd, 10);
+	ASSERT_EQ(ret, 0);
+
+	len = sizeof(addr);
+	ret = getsockname(sfd, &addr, &len);
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	len = sizeof(addr);
+	ret = getsockname(fd, &addr, &len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (ret) {
+		ASSERT_EQ(errno, ENOENT);
+		SKIP(return, "no TLS support");
+	}
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, tls12.len);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, SOL_TLS, TLS_RX, &tls12, tls12.len);
+	ASSERT_EQ(ret, 0);
+
+	len2 = sizeof(addr2);
+	ret = getsockname(fd, &addr2, &len2);
+	ASSERT_EQ(ret, 0);
+
+	EXPECT_EQ(len2, len);
+	EXPECT_EQ(memcmp(&addr, &addr2, len), 0);
+
+	close(fd);
+	close(sfd);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 4916ea8b06a594d9d4e0cf5aa434e9ebd56baafb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 18 Nov 2021 05:23:20 -0500
Subject: selftests: fix check for circular KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM

KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM leaves the source VM in a dead state,
so migrating back to the original source VM fails the ioctl.  Adjust
the test.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 5ba325cd64bf..a66b9be30239 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -89,7 +89,7 @@ static void test_sev_migrate_from(bool es)
 {
 	struct kvm_vm *src_vm;
 	struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
-	int i;
+	int i, ret;
 
 	src_vm = sev_vm_create(es);
 	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
@@ -102,7 +102,10 @@ static void test_sev_migrate_from(bool es)
 		sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd);
 
 	/* Migrate the guest back to the original VM. */
-	sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
+	ret = __sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
+	TEST_ASSERT(ret == -1 && errno == EIO,
+		    "VM that was migrated from should be dead. ret %d, errno: %d\n", ret,
+		    errno);
 
 	kvm_vm_free(src_vm);
 	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
-- 
cgit v1.2.3


From 826bff439ff8dd8670a313c4a8d378fca0d5df3e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 17 Nov 2021 10:49:24 -0500
Subject: selftests: sev_migrate_tests: free all VMs

Ensure that the ASID are freed promptly, which becomes more important
when more tests are added to this file.

Cc: Peter Gonda <pgonda@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index a66b9be30239..0cd7e2eaa895 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -149,6 +149,8 @@ static void test_sev_migrate_locking(void)
 
 	for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
 		pthread_join(pt[i], NULL);
+	for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
+		kvm_vm_free(input[i].vm);
 }
 
 static void test_sev_migrate_parameters(void)
@@ -165,7 +167,6 @@ static void test_sev_migrate_parameters(void)
 	sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
 	vm_vcpu_add(sev_es_vm_no_vmsa, 1);
 
-
 	ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd);
 	TEST_ASSERT(
 		ret == -1 && errno == EINVAL,
@@ -194,6 +195,12 @@ static void test_sev_migrate_parameters(void)
 	TEST_ASSERT(ret == -1 && errno == EINVAL,
 		    "Migrations require SEV enabled. ret %d, errno: %d\n", ret,
 		    errno);
+
+	kvm_vm_free(sev_vm);
+	kvm_vm_free(sev_es_vm);
+	kvm_vm_free(sev_es_vm_no_vmsa);
+	kvm_vm_free(vm_no_vcpu);
+	kvm_vm_free(vm_no_sev);
 }
 
 int main(int argc, char *argv[])
-- 
cgit v1.2.3


From 6c1186430a808f97e2052bd5d9eff12c5d5defb0 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 22 Nov 2021 18:58:17 +0100
Subject: KVM: selftests: Avoid KVM_SET_CPUID2 after KVM_RUN in hyperv_features
 test

hyperv_features's sole purpose is to test access to various Hyper-V MSRs
and hypercalls with different CPUID data. As KVM_SET_CPUID2 after KVM_RUN
is deprecated and soon-to-be forbidden, avoid it by re-creating test VM
for each sub-test.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20211122175818.608220-2-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/x86_64/hyperv_features.c | 140 +++++++++++----------
 1 file changed, 71 insertions(+), 69 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 91d88aaa9899..672915ce73d8 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -165,10 +165,10 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid,
 	vcpu_set_cpuid(vm, VCPU_ID, cpuid);
 }
 
-static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
-				   struct kvm_cpuid2 *best)
+static void guest_test_msrs_access(void)
 {
 	struct kvm_run *run;
+	struct kvm_vm *vm;
 	struct ucall uc;
 	int stage = 0, r;
 	struct kvm_cpuid_entry2 feat = {
@@ -180,11 +180,34 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
 	struct kvm_cpuid_entry2 dbg = {
 		.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
 	};
-	struct kvm_enable_cap cap = {0};
-
-	run = vcpu_state(vm, VCPU_ID);
+	struct kvm_cpuid2 *best;
+	vm_vaddr_t msr_gva;
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+		.args = {1}
+	};
+	struct msr_data *msr;
 
 	while (true) {
+		vm = vm_create_default(VCPU_ID, 0, guest_msr);
+
+		msr_gva = vm_vaddr_alloc_page(vm);
+		memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
+		msr = addr_gva2hva(vm, msr_gva);
+
+		vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
+		vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+		vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+		best = kvm_get_supported_hv_cpuid();
+
+		vm_init_descriptor_tables(vm);
+		vcpu_init_descriptor_tables(vm, VCPU_ID);
+		vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+
+		run = vcpu_state(vm, VCPU_ID);
+
 		switch (stage) {
 		case 0:
 			/*
@@ -315,6 +338,7 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
 			 * capability enabled and guest visible CPUID bit unset.
 			 */
 			cap.cap = KVM_CAP_HYPERV_SYNIC2;
+			cap.args[0] = 0;
 			vcpu_enable_cap(vm, VCPU_ID, &cap);
 			break;
 		case 22:
@@ -461,9 +485,9 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
 
 		switch (get_ucall(vm, VCPU_ID, &uc)) {
 		case UCALL_SYNC:
-			TEST_ASSERT(uc.args[1] == stage,
-				    "Unexpected stage: %ld (%d expected)\n",
-				    uc.args[1], stage);
+			TEST_ASSERT(uc.args[1] == 0,
+				    "Unexpected stage: %ld (0 expected)\n",
+				    uc.args[1]);
 			break;
 		case UCALL_ABORT:
 			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -474,13 +498,14 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
 		}
 
 		stage++;
+		kvm_vm_free(vm);
 	}
 }
 
-static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall,
-				     void *input, void *output, struct kvm_cpuid2 *best)
+static void guest_test_hcalls_access(void)
 {
 	struct kvm_run *run;
+	struct kvm_vm *vm;
 	struct ucall uc;
 	int stage = 0, r;
 	struct kvm_cpuid_entry2 feat = {
@@ -493,10 +518,38 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
 	struct kvm_cpuid_entry2 dbg = {
 		.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
 	};
-
-	run = vcpu_state(vm, VCPU_ID);
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+		.args = {1}
+	};
+	vm_vaddr_t hcall_page, hcall_params;
+	struct hcall_data *hcall;
+	struct kvm_cpuid2 *best;
 
 	while (true) {
+		vm = vm_create_default(VCPU_ID, 0, guest_hcall);
+
+		vm_init_descriptor_tables(vm);
+		vcpu_init_descriptor_tables(vm, VCPU_ID);
+		vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+		/* Hypercall input/output */
+		hcall_page = vm_vaddr_alloc_pages(vm, 2);
+		hcall = addr_gva2hva(vm, hcall_page);
+		memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+		hcall_params = vm_vaddr_alloc_page(vm);
+		memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
+
+		vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
+		vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+		vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+		best = kvm_get_supported_hv_cpuid();
+
+		run = vcpu_state(vm, VCPU_ID);
+
 		switch (stage) {
 		case 0:
 			hcall->control = 0xdeadbeef;
@@ -606,9 +659,9 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
 
 		switch (get_ucall(vm, VCPU_ID, &uc)) {
 		case UCALL_SYNC:
-			TEST_ASSERT(uc.args[1] == stage,
-				    "Unexpected stage: %ld (%d expected)\n",
-				    uc.args[1], stage);
+			TEST_ASSERT(uc.args[1] == 0,
+				    "Unexpected stage: %ld (0 expected)\n",
+				    uc.args[1]);
 			break;
 		case UCALL_ABORT:
 			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -619,66 +672,15 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
 		}
 
 		stage++;
+		kvm_vm_free(vm);
 	}
 }
 
 int main(void)
 {
-	struct kvm_cpuid2 *best;
-	struct kvm_vm *vm;
-	vm_vaddr_t msr_gva, hcall_page, hcall_params;
-	struct kvm_enable_cap cap = {
-		.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
-		.args = {1}
-	};
-
-	/* Test MSRs */
-	vm = vm_create_default(VCPU_ID, 0, guest_msr);
-
-	msr_gva = vm_vaddr_alloc_page(vm);
-	memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
-	vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
-	vcpu_enable_cap(vm, VCPU_ID, &cap);
-
-	vcpu_set_hv_cpuid(vm, VCPU_ID);
-
-	best = kvm_get_supported_hv_cpuid();
-
-	vm_init_descriptor_tables(vm);
-	vcpu_init_descriptor_tables(vm, VCPU_ID);
-	vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
-
 	pr_info("Testing access to Hyper-V specific MSRs\n");
-	guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva),
-			       best);
-	kvm_vm_free(vm);
-
-	/* Test hypercalls */
-	vm = vm_create_default(VCPU_ID, 0, guest_hcall);
-
-	vm_init_descriptor_tables(vm);
-	vcpu_init_descriptor_tables(vm, VCPU_ID);
-	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
-
-	/* Hypercall input/output */
-	hcall_page = vm_vaddr_alloc_pages(vm, 2);
-	memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
-
-	hcall_params = vm_vaddr_alloc_page(vm);
-	memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
-
-	vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
-	vcpu_enable_cap(vm, VCPU_ID, &cap);
-
-	vcpu_set_hv_cpuid(vm, VCPU_ID);
-
-	best = kvm_get_supported_hv_cpuid();
+	guest_test_msrs_access();
 
 	pr_info("Testing access to Hyper-V hypercalls\n");
-	guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params),
-				 addr_gva2hva(vm, hcall_page),
-				 addr_gva2hva(vm, hcall_page) + getpagesize(),
-				 best);
-
-	kvm_vm_free(vm);
+	guest_test_hcalls_access();
 }
-- 
cgit v1.2.3


From 908fa88e420f30dde6d80f092795a18ec72ca6d3 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 23 Nov 2021 14:59:53 +0100
Subject: KVM: selftests: Make sure kvm_create_max_vcpus test won't hit
 RLIMIT_NOFILE

With the elevated 'KVM_CAP_MAX_VCPUS' value kvm_create_max_vcpus test
may hit RLIMIT_NOFILE limits:

 # ./kvm_create_max_vcpus
 KVM_CAP_MAX_VCPU_ID: 4096
 KVM_CAP_MAX_VCPUS: 1024
 Testing creating 1024 vCPUs, with IDs 0...1023.
 /dev/kvm not available (errno: 24), skipping test

Adjust RLIMIT_NOFILE limits to make sure KVM_CAP_MAX_VCPUS fds can be
opened. Note, raising hard limit ('rlim_max') requires CAP_SYS_RESOURCE
capability which is generally not needed to run kvm selftests (but without
raising the limit the test is doomed to fail anyway).

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20211123135953.667434-1-vkuznets@redhat.com>
[Skip the test if the hard limit can be raised. - Paolo]
Reviewed-by: Sean Christopherson <seanjc@google.com>
Tested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
index f968dfd4ee88..aed9dc3ca1e9 100644
--- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/resource.h>
 
 #include "test_util.h"
 
@@ -40,10 +41,39 @@ int main(int argc, char *argv[])
 {
 	int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
 	int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	/*
+	 * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds +
+	 * an arbitrary number for everything else.
+	 */
+	int nr_fds_wanted = kvm_max_vcpus + 100;
+	struct rlimit rl;
 
 	pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
 	pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
 
+	/*
+	 * Check that we're allowed to open nr_fds_wanted file descriptors and
+	 * try raising the limits if needed.
+	 */
+	TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
+
+	if (rl.rlim_cur < nr_fds_wanted) {
+		rl.rlim_cur = nr_fds_wanted;
+		if (rl.rlim_max < nr_fds_wanted) {
+			int old_rlim_max = rl.rlim_max;
+			rl.rlim_max = nr_fds_wanted;
+
+			int r = setrlimit(RLIMIT_NOFILE, &rl);
+			if (r < 0) {
+				printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
+				       old_rlim_max, nr_fds_wanted);
+				exit(KSFT_SKIP);
+			}
+		} else {
+			TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
+		}
+	}
+
 	/*
 	 * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
 	 * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
-- 
cgit v1.2.3


From e32cb12ff52a2840fc1248998717f7b95c42f064 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 25 Nov 2021 09:36:07 +0800
Subject: bpf, mips: Fix build errors about __NR_bpf undeclared
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the __NR_bpf definitions to fix the following build errors for mips:

  $ cd tools/bpf/bpftool
  $ make
  [...]
  bpf.c:54:4: error: #error __NR_bpf not defined. libbpf does not support your arch.
   #  error __NR_bpf not defined. libbpf does not support your arch.
      ^~~~~
  bpf.c: In function ‘sys_bpf’:
  bpf.c:66:17: error: ‘__NR_bpf’ undeclared (first use in this function); did you mean ‘__NR_brk’?
    return syscall(__NR_bpf, cmd, attr, size);
                   ^~~~~~~~
                   __NR_brk
  [...]
  In file included from gen_loader.c:15:0:
  skel_internal.h: In function ‘skel_sys_bpf’:
  skel_internal.h:53:17: error: ‘__NR_bpf’ undeclared (first use in this function); did you mean ‘__NR_brk’?
    return syscall(__NR_bpf, cmd, attr, size);
                   ^~~~~~~~
                   __NR_brk

We can see the following generated definitions:

  $ grep -r "#define __NR_bpf" arch/mips
  arch/mips/include/generated/uapi/asm/unistd_o32.h:#define __NR_bpf (__NR_Linux + 355)
  arch/mips/include/generated/uapi/asm/unistd_n64.h:#define __NR_bpf (__NR_Linux + 315)
  arch/mips/include/generated/uapi/asm/unistd_n32.h:#define __NR_bpf (__NR_Linux + 319)

The __NR_Linux is defined in arch/mips/include/uapi/asm/unistd.h:

  $ grep -r "#define __NR_Linux" arch/mips
  arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux	4000
  arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux	5000
  arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux	6000

That is to say, __NR_bpf is:

  4000 + 355 = 4355 for mips o32,
  6000 + 319 = 6319 for mips n32,
  5000 + 315 = 5315 for mips n64.

So use the GCC pre-defined macro _ABIO32, _ABIN32 and _ABI64 [1] to define
the corresponding __NR_bpf.

This patch is similar with commit bad1926dd2f6 ("bpf, s390: fix build for
libbpf and selftest suite").

  [1] https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/mips/mips.h#l549

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/1637804167-8323-1-git-send-email-yangtiezhu@loongson.cn
---
 tools/build/feature/test-bpf.c |  6 ++++++
 tools/lib/bpf/bpf.c            |  6 ++++++
 tools/lib/bpf/skel_internal.h  | 10 ++++++++++
 3 files changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c
index 82070eadfc07..727d22e34a6e 100644
--- a/tools/build/feature/test-bpf.c
+++ b/tools/build/feature/test-bpf.c
@@ -14,6 +14,12 @@
 #  define __NR_bpf 349
 # elif defined(__s390__)
 #  define __NR_bpf 351
+# elif defined(__mips__) && defined(_ABIO32)
+#  define __NR_bpf 4355
+# elif defined(__mips__) && defined(_ABIN32)
+#  define __NR_bpf 6319
+# elif defined(__mips__) && defined(_ABI64)
+#  define __NR_bpf 5315
 # else
 #  error __NR_bpf not defined. libbpf does not support your arch.
 # endif
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 053c86e3d20f..4e7836e1a7b5 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -50,6 +50,12 @@
 #  define __NR_bpf 351
 # elif defined(__arc__)
 #  define __NR_bpf 280
+# elif defined(__mips__) && defined(_ABIO32)
+#  define __NR_bpf 4355
+# elif defined(__mips__) && defined(_ABIN32)
+#  define __NR_bpf 6319
+# elif defined(__mips__) && defined(_ABI64)
+#  define __NR_bpf 5315
 # else
 #  error __NR_bpf not defined. libbpf does not support your arch.
 # endif
diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h
index b206532704ce..0b84d8e6b72a 100644
--- a/tools/lib/bpf/skel_internal.h
+++ b/tools/lib/bpf/skel_internal.h
@@ -7,6 +7,16 @@
 #include <sys/syscall.h>
 #include <sys/mman.h>
 
+#ifndef __NR_bpf
+# if defined(__mips__) && defined(_ABIO32)
+#  define __NR_bpf 4355
+# elif defined(__mips__) && defined(_ABIN32)
+#  define __NR_bpf 6319
+# elif defined(__mips__) && defined(_ABI64)
+#  define __NR_bpf 5315
+# endif
+#endif
+
 /* This file is a base header for auto-generated *.lskel.h files.
  * Its contents will change and may become part of auto-generation in the future.
  *
-- 
cgit v1.2.3


From 71ae450f97ad5e992e5e0431193c767fd624f6e7 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:49 +0200
Subject: selftests: net: bridge: add vlan mcast snooping control test

Add the first test for bridge per-vlan multicast snooping which checks
if control of the global and per-vlan options work as expected, joins
and leaves are tested at each option value.

TEST: Vlan multicast snooping enable                                [ OK ]
TEST: Vlan global options existence                                 [ OK ]
TEST: Vlan mcast_snooping global option default value               [ OK ]
TEST: Vlan 10 multicast snooping control                            [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 148 +++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
new file mode 100755
index 000000000000..796e8f094e08
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="vlmc_control_test"
+NUM_NETIFS=4
+CHECK_TC="yes"
+TEST_GROUP="239.10.10.10"
+
+source lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+	ip link add l $h1 $h1.10 up type vlan id 10
+}
+
+h1_destroy()
+{
+	ip link del $h1.10
+	simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64
+	ip link add l $h2 $h2.10 up type vlan id 10
+}
+
+h2_destroy()
+{
+	ip link del $h2.10
+	simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+switch_create()
+{
+	ip link add dev br0 type bridge mcast_snooping 1 mcast_querier 1 vlan_filtering 1
+
+	ip link set dev $swp1 master br0
+	ip link set dev $swp2 master br0
+
+	ip link set dev br0 up
+	ip link set dev $swp1 up
+	ip link set dev $swp2 up
+
+	bridge vlan add vid 10-11 dev $swp1 master
+	bridge vlan add vid 10-11 dev $swp2 master
+
+	ip link set dev br0 type bridge mcast_vlan_snooping 1
+	check_err $? "Could not enable global vlan multicast snooping"
+	log_test "Vlan multicast snooping enable"
+}
+
+switch_destroy()
+{
+	ip link set dev $swp2 down
+	ip link set dev $swp1 down
+
+	ip link del dev br0
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+
+	switch_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	switch_destroy
+
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+vlmc_v2join_test()
+{
+	local expect=$1
+
+	RET=0
+	ip address add dev $h2.10 $TEST_GROUP/32 autojoin
+	check_err $? "Could not join $TEST_GROUP"
+
+	sleep 5
+	bridge -j mdb show dev br0 |
+		jq -e ".[].mdb[] | select(.grp == \"$TEST_GROUP\" and .vid == 10)" &>/dev/null
+	if [ $expect -eq 0 ]; then
+		check_err $? "IGMPv2 report didn't create mdb entry for $TEST_GROUP"
+	else
+		check_fail $? "IGMPv2 report shouldn't have created mdb entry for $TEST_GROUP"
+	fi
+
+	# check if we need to cleanup
+	if [ $RET -eq 0 ]; then
+		ip address del dev $h2.10 $TEST_GROUP/32 2>&1 1>/dev/null
+		sleep 5
+		bridge -j mdb show dev br0 |
+			jq -e ".[].mdb[] | select(.grp == \"$TEST_GROUP\" and \
+						  .vid == 10)" &>/dev/null
+		check_fail $? "IGMPv2 leave didn't remove mdb entry for $TEST_GROUP"
+	fi
+}
+
+vlmc_control_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+	log_test "Vlan global options existence"
+
+	RET=0
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and .mcast_snooping == 1) " &>/dev/null
+	check_err $? "Wrong default mcast_snooping global option value"
+	log_test "Vlan mcast_snooping global option default value"
+
+	RET=0
+	vlmc_v2join_test 0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 0
+	check_err $? "Could not disable multicast snooping in vlan 10"
+	vlmc_v2join_test 1
+	log_test "Vlan 10 multicast snooping control"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From dee2cdc0e3bb8dbdd21f8f441d5a7cac2ddcefe3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:50 +0200
Subject: selftests: net: bridge: add vlan mcast querier test

Add a test to try the new global vlan mcast_querier control and also
verify that tagged general query packets are properly generated when
querier is enabled for a single vlan.

TEST: Vlan mcast_querier global option default value                [ OK ]
TEST: Vlan 10 multicast querier enable                              [ OK ]
TEST: Vlan 10 tagged IGMPv2 general query sent                      [ OK ]
TEST: Vlan 10 tagged MLD general query sent                         [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 105 ++++++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 796e8f094e08..aa23764a328b 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="vlmc_control_test"
+ALL_TESTS="vlmc_control_test vlmc_querier_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -43,6 +43,9 @@ switch_create()
 	ip link set dev $swp1 up
 	ip link set dev $swp2 up
 
+	tc qdisc add dev $swp1 clsact
+	tc qdisc add dev $swp2 clsact
+
 	bridge vlan add vid 10-11 dev $swp1 master
 	bridge vlan add vid 10-11 dev $swp2 master
 
@@ -138,6 +141,106 @@ vlmc_control_test()
 	log_test "Vlan 10 multicast snooping control"
 }
 
+# setup for general query counting
+vlmc_query_cnt_xstats()
+{
+	local type=$1
+	local version=$2
+	local dev=$3
+
+	ip -j link xstats type bridge_slave dev $dev | \
+	jq -e ".[].multicast.${type}_queries.tx_v${version}"
+}
+
+vlmc_query_cnt_setup()
+{
+	local type=$1
+	local dev=$2
+
+	if [[ $type == "igmp" ]]; then
+		tc filter add dev $dev egress pref 10 prot 802.1Q \
+			flower vlan_id 10 vlan_ethtype ipv4 dst_ip 224.0.0.1 ip_proto 2 \
+			action pass
+	else
+		tc filter add dev $dev egress pref 10 prot 802.1Q \
+			flower vlan_id 10 vlan_ethtype ipv6 dst_ip ff02::1 ip_proto icmpv6 \
+			action pass
+	fi
+
+	ip link set dev br0 type bridge mcast_stats_enabled 1
+}
+
+vlmc_query_cnt_cleanup()
+{
+	local dev=$1
+
+	ip link set dev br0 type bridge mcast_stats_enabled 0
+	tc filter del dev $dev egress pref 10
+}
+
+vlmc_check_query()
+{
+	local type=$1
+	local version=$2
+	local dev=$3
+	local expect=$4
+	local time=$5
+	local ret=0
+
+	vlmc_query_cnt_setup $type $dev
+
+	local pre_tx_xstats=$(vlmc_query_cnt_xstats $type $version $dev)
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 1
+	ret=$?
+	if [[ $ret -eq 0 ]]; then
+		sleep $time
+
+		local tcstats=$(tc_rule_stats_get $dev 10 egress)
+		local post_tx_xstats=$(vlmc_query_cnt_xstats $type $version $dev)
+
+		if [[ $tcstats != $expect || \
+		      $(($post_tx_xstats-$pre_tx_xstats)) != $expect || \
+		      $tcstats != $(($post_tx_xstats-$pre_tx_xstats)) ]]; then
+			ret=1
+		fi
+	fi
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 0
+	vlmc_query_cnt_cleanup $dev
+
+	return $ret
+}
+
+vlmc_querier_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and .mcast_querier == 0) " &>/dev/null
+	check_err $? "Wrong default mcast_querier global vlan option value"
+	log_test "Vlan mcast_querier global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 1
+	check_err $? "Could not enable querier in vlan 10"
+	log_test "Vlan 10 multicast querier enable"
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 0
+
+	RET=0
+	vlmc_check_query igmp 2 $swp1 1 1
+	check_err $? "No vlan tagged IGMPv2 general query packets sent"
+	log_test "Vlan 10 tagged IGMPv2 general query sent"
+
+	RET=0
+	vlmc_check_query mld 1 $swp1 1 1
+	check_err $? "No vlan tagged MLD general query packets sent"
+	log_test "Vlan 10 tagged MLD general query sent"
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From 2b75e9dd580c33a5618f880dfd81b7cec6576a4b Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:51 +0200
Subject: selftests: net: bridge: add vlan mcast igmp/mld version tests

Add tests which change the new per-vlan IGMP/MLD versions and verify
that proper tagged general query packets are sent.

TEST: Vlan mcast_igmp_version global option default value           [ OK ]
TEST: Vlan mcast_mld_version global option default value            [ OK ]
TEST: Vlan 10 mcast_igmp_version option changed to 3                [ OK ]
TEST: Vlan 10 tagged IGMPv3 general query sent                      [ OK ]
TEST: Vlan 10 mcast_mld_version option changed to 2                 [ OK ]
TEST: Vlan 10 tagged MLDv2 general query sent                       [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 45 +++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index aa23764a328b..1b91778fac2f 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="vlmc_control_test vlmc_querier_test"
+ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -241,6 +241,49 @@ vlmc_querier_test()
 	log_test "Vlan 10 tagged MLD general query sent"
 }
 
+vlmc_igmp_mld_version_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and .mcast_igmp_version == 2) " &>/dev/null
+	check_err $? "Wrong default mcast_igmp_version global vlan option value"
+	log_test "Vlan mcast_igmp_version global option default value"
+
+	RET=0
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and .mcast_mld_version == 1) " &>/dev/null
+	check_err $? "Wrong default mcast_mld_version global vlan option value"
+	log_test "Vlan mcast_mld_version global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 3
+	check_err $? "Could not set mcast_igmp_version in vlan 10"
+	log_test "Vlan 10 mcast_igmp_version option changed to 3"
+
+	RET=0
+	vlmc_check_query igmp 3 $swp1 1 1
+	check_err $? "No vlan tagged IGMPv3 general query packets sent"
+	log_test "Vlan 10 tagged IGMPv3 general query sent"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 2
+	check_err $? "Could not set mcast_mld_version in vlan 10"
+	log_test "Vlan 10 mcast_mld_version option changed to 2"
+
+	RET=0
+	vlmc_check_query mld 2 $swp1 1 1
+	check_err $? "No vlan tagged MLDv2 general query packets sent"
+	log_test "Vlan 10 tagged MLDv2 general query sent"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 2
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 1
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From 3825f1fb675bfc2d75a39202fe8a394607441016 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:52 +0200
Subject: selftests: net: bridge: add vlan mcast_last_member_count/interval
 tests

Add tests which verify the default values of mcast_last_member_count
mcast_last_member_count and also try to change them.

TEST: Vlan mcast_last_member_count global option default value      [ OK ]
TEST: Vlan mcast_last_member_interval global option default value   [ OK ]
TEST: Vlan 10 mcast_last_member_count option changed to 3           [ OK ]
TEST: Vlan 10 mcast_last_member_interval option changed to 200      [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 36 +++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 1b91778fac2f..85146c998316 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test"
+ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test vlmc_last_member_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -284,6 +284,40 @@ vlmc_igmp_mld_version_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 1
 }
 
+vlmc_last_member_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_last_member_count == 2) " &>/dev/null
+	check_err $? "Wrong default mcast_last_member_count global vlan option value"
+	log_test "Vlan mcast_last_member_count global option default value"
+
+	RET=0
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_last_member_interval == 100) " &>/dev/null
+	check_err $? "Wrong default mcast_last_member_interval global vlan option value"
+	log_test "Vlan mcast_last_member_interval global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_last_member_count 3
+	check_err $? "Could not set mcast_last_member_count in vlan 10"
+	log_test "Vlan 10 mcast_last_member_count option changed to 3"
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_last_member_count 2
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_last_member_interval 200
+	check_err $? "Could not set mcast_last_member_interval in vlan 10"
+	log_test "Vlan 10 mcast_last_member_interval option changed to 200"
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_last_member_interval 100
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From bdf1b2c05e09b239f553c0ee828e2f851f4a1b47 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:53 +0200
Subject: selftests: net: bridge: add vlan mcast_startup_query_count/interval
 tests

Add tests which change the new per-vlan startup query count/interval
options and verify the proper number of queries are sent in the expected
interval.

TEST: Vlan mcast_startup_query_interval global option default value   [ OK ]
TEST: Vlan mcast_startup_query_count global option default value    [ OK ]
TEST: Vlan 10 mcast_startup_query_interval option changed to 100    [ OK ]
TEST: Vlan 10 mcast_startup_query_count option changed to 3         [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 42 +++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 85146c998316..0070d8292569 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test vlmc_last_member_test"
+ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
+	   vlmc_last_member_test vlmc_startup_query_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -318,6 +319,45 @@ vlmc_last_member_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_last_member_interval 100
 }
 
+vlmc_startup_query_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_startup_query_interval == 3125) " &>/dev/null
+	check_err $? "Wrong default mcast_startup_query_interval global vlan option value"
+	log_test "Vlan mcast_startup_query_interval global option default value"
+
+	RET=0
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_startup_query_count == 2) " &>/dev/null
+	check_err $? "Wrong default mcast_startup_query_count global vlan option value"
+	log_test "Vlan mcast_startup_query_count global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_interval 100
+	check_err $? "Could not set mcast_startup_query_interval in vlan 10"
+	vlmc_check_query igmp 2 $swp1 2 3
+	check_err $? "Wrong number of tagged IGMPv2 general queries sent"
+	log_test "Vlan 10 mcast_startup_query_interval option changed to 100"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 3
+	check_err $? "Could not set mcast_startup_query_count in vlan 10"
+	vlmc_check_query igmp 2 $swp1 3 4
+	check_err $? "Wrong number of tagged IGMPv2 general queries sent"
+	log_test "Vlan 10 mcast_startup_query_count option changed to 3"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_interval 3125
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 2
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From a45fe974173668c1cbd1e23eb9fabdb5b17ae0be Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:54 +0200
Subject: selftests: net: bridge: add vlan mcast_membership_interval test

Add a test which changes the new per-vlan mcast_membership_interval and
verifies that a newly learned mdb entry would expire in that interval.

TEST: Vlan mcast_membership_interval global option default value    [ OK ]
TEST: Vlan 10 mcast_membership_interval option changed to 200       [ OK ]
TEST: Vlan 10 mcast_membership_interval mdb entry expire            [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 28 +++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 0070d8292569..87dcd49b0a8d 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
-	   vlmc_last_member_test vlmc_startup_query_test"
+	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -358,6 +358,32 @@ vlmc_startup_query_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 2
 }
 
+vlmc_membership_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_membership_interval == 26000) " &>/dev/null
+	check_err $? "Wrong default mcast_membership_interval global vlan option value"
+	log_test "Vlan mcast_membership_interval global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_membership_interval 200
+	check_err $? "Could not set mcast_membership_interval in vlan 10"
+	log_test "Vlan 10 mcast_membership_interval option changed to 200"
+
+	RET=0
+	vlmc_v2join_test 1
+	log_test "Vlan 10 mcast_membership_interval mdb entry expire"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_membership_interval 26000
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From 4d8610ee8bd77727dec7833065d2b2ae63366e49 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:55 +0200
Subject: selftests: net: bridge: add vlan mcast_querier_interval tests

Add tests which change the new per-vlan mcast_querier_interval and
verify that it is taken into account when an outside querier is present.

TEST: Vlan mcast_querier_interval global option default value       [ OK ]
TEST: Vlan 10 mcast_querier_interval option changed to 100          [ OK ]
TEST: Vlan 10 mcast_querier_interval expire after outside query     [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 40 +++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 87dcd49b0a8d..9a68d56fd74c 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -2,7 +2,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
-	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test"
+	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test \
+	   vlmc_querier_intvl_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -384,6 +385,43 @@ vlmc_membership_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_membership_interval 26000
 }
 
+vlmc_querier_intvl_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_querier_interval == 25500) " &>/dev/null
+	check_err $? "Wrong default mcast_querier_interval global vlan option value"
+	log_test "Vlan mcast_querier_interval global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier_interval 100
+	check_err $? "Could not set mcast_querier_interval in vlan 10"
+	log_test "Vlan 10 mcast_querier_interval option changed to 100"
+
+	RET=0
+	ip link add dev br1 type bridge mcast_snooping 1 mcast_querier 1 vlan_filtering 1 \
+					mcast_vlan_snooping 1
+	bridge vlan add vid 10 dev br1 self pvid untagged
+	ip link set dev $h1 master br1
+	ip link set dev br1 up
+	bridge vlan add vid 10 dev $h1 master
+	bridge vlan global set vid 10 dev br1 mcast_snooping 1 mcast_querier 1
+	sleep 2
+	ip link del dev br1
+	ip addr replace 2001:db8:1::1/64 dev $h1
+	vlmc_check_query igmp 2 $swp1 1 1
+	check_err $? "Wrong number of IGMPv2 general queries after querier interval"
+	log_test "Vlan 10 mcast_querier_interval expire after outside query"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier_interval 25500
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From b4ce7b9523c407cdd657c3bc20dcb6eb6701f59f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:56 +0200
Subject: selftests: net: bridge: add vlan mcast query and query response
 interval tests

Add tests which change the new per-vlan mcast_query_interval and verify
the new value is in effect, also add a test to change
mcast_query_response_interval's value.

TEST: Vlan mcast_query_interval global option default value         [ OK ]
TEST: Vlan 10 mcast_query_interval option changed to 200            [ OK ]
TEST: Vlan mcast_query_response_interval global option default value   [ OK ]
TEST: Vlan 10 mcast_query_response_interval option changed to 200   [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 51 +++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 9a68d56fd74c..fbc7f5045b26 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -3,7 +3,7 @@
 
 ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
 	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test \
-	   vlmc_querier_intvl_test"
+	   vlmc_querier_intvl_test vlmc_query_intvl_test vlmc_query_response_intvl_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -422,6 +422,55 @@ vlmc_querier_intvl_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier_interval 25500
 }
 
+vlmc_query_intvl_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_query_interval == 12500) " &>/dev/null
+	check_err $? "Wrong default mcast_query_interval global vlan option value"
+	log_test "Vlan mcast_query_interval global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_interval 200
+	check_err $? "Could not set mcast_query_interval in vlan 10"
+	# 1 is sent immediately, then 2 more in the next 5 seconds
+	vlmc_check_query igmp 2 $swp1 3 5
+	check_err $? "Wrong number of tagged IGMPv2 general queries sent"
+	log_test "Vlan 10 mcast_query_interval option changed to 200"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 2
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_interval 12500
+}
+
+vlmc_query_response_intvl_test()
+{
+	RET=0
+	local goutput=`bridge -j vlan global show`
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10)" &>/dev/null
+	check_err $? "Could not find vlan 10's global options"
+
+	echo -n $goutput |
+		jq -e ".[].vlans[] | select(.vlan == 10 and \
+					    .mcast_query_response_interval == 1000) " &>/dev/null
+	check_err $? "Wrong default mcast_query_response_interval global vlan option value"
+	log_test "Vlan mcast_query_response_interval global option default value"
+
+	RET=0
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 200
+	check_err $? "Could not set mcast_query_response_interval in vlan 10"
+	log_test "Vlan 10 mcast_query_response_interval option changed to 200"
+
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 1000
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From 2cd67a4e278e2bf95dac5a0889d55b17b37f3de7 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:57 +0200
Subject: selftests: net: bridge: add vlan mcast_router tests

Add tests for the new per-port/vlan mcast_router option, verify that
unknown multicast packets are flooded only to router ports.

TEST: Port vlan 10 option mcast_router default value                [ OK ]
TEST: Port vlan 10 mcast_router option changed to 2                 [ OK ]
TEST: Flood unknown vlan multicast packets to router port only      [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_mcast.sh  | 54 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index fbc7f5045b26..898a70f4d226 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -3,7 +3,8 @@
 
 ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
 	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test \
-	   vlmc_querier_intvl_test vlmc_query_intvl_test vlmc_query_response_intvl_test"
+	   vlmc_querier_intvl_test vlmc_query_intvl_test vlmc_query_response_intvl_test \
+	   vlmc_router_port_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -471,6 +472,57 @@ vlmc_query_response_intvl_test()
 	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 1000
 }
 
+vlmc_router_port_test()
+{
+	RET=0
+	local goutput=`bridge -j -d vlan show`
+	echo -n $goutput |
+		jq -e ".[] | select(.ifname == \"$swp1\" and \
+				    .vlans[].vlan == 10)" &>/dev/null
+	check_err $? "Could not find port vlan 10's options"
+
+	echo -n $goutput |
+		jq -e ".[] | select(.ifname == \"$swp1\" and \
+				    .vlans[].vlan == 10 and \
+				    .vlans[].mcast_router == 1)" &>/dev/null
+	check_err $? "Wrong default port mcast_router option value"
+	log_test "Port vlan 10 option mcast_router default value"
+
+	RET=0
+	bridge vlan set vid 10 dev $swp1 mcast_router 2
+	check_err $? "Could not set port vlan 10's mcast_router option"
+	log_test "Port vlan 10 mcast_router option changed to 2"
+
+	RET=0
+	tc filter add dev $swp1 egress pref 10 prot 802.1Q \
+		flower vlan_id 10 vlan_ethtype ipv4 dst_ip 239.1.1.1 ip_proto udp action pass
+	tc filter add dev $swp2 egress pref 10 prot 802.1Q \
+		flower vlan_id 10 vlan_ethtype ipv4 dst_ip 239.1.1.1 ip_proto udp action pass
+	bridge vlan set vid 10 dev $swp2 mcast_router 0
+	# we need to enable querier and disable query response interval to
+	# make sure packets are flooded only to router ports
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 1 \
+					      mcast_query_response_interval 0
+	bridge vlan add vid 10 dev br0 self
+	sleep 1
+	mausezahn br0 -Q 10 -c 10 -p 128 -b 01:00:5e:01:01:01 -B 239.1.1.1 \
+			-t udp "dp=1024" &>/dev/null
+	local swp1_tcstats=$(tc_rule_stats_get $swp1 10 egress)
+	if [[ $swp1_tcstats != 10 ]]; then
+		check_err 1 "Wrong number of vlan 10 multicast packets flooded"
+	fi
+	local swp2_tcstats=$(tc_rule_stats_get $swp2 10 egress)
+	check_err $swp2_tcstats "Vlan 10 multicast packets flooded to non-router port"
+	log_test "Flood unknown vlan multicast packets to router port only"
+
+	tc filter del dev $swp2 egress pref 10
+	tc filter del dev $swp1 egress pref 10
+	bridge vlan del vid 10 dev br0 self
+	bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 1000
+	bridge vlan set vid 10 dev $swp2 mcast_router 1
+	bridge vlan set vid 10 dev $swp1 mcast_router 1
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From f5a9dd58f48b7c26da42387d6d38889fce8a2778 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Thu, 25 Nov 2021 16:08:58 +0200
Subject: selftests: net: bridge: add test for vlan_filtering dependency

Add a test for dependency of mcast_vlan_snooping on vlan_filtering. If
vlan_filtering gets disabled, then mcast_vlan_snooping must be
automatically disabled as well.

TEST: Disable multicast vlan snooping when vlan filtering is disabled   [ OK ]

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 898a70f4d226..5224a5a8595b 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -4,7 +4,7 @@
 ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \
 	   vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test \
 	   vlmc_querier_intvl_test vlmc_query_intvl_test vlmc_query_response_intvl_test \
-	   vlmc_router_port_test"
+	   vlmc_router_port_test vlmc_filtering_test"
 NUM_NETIFS=4
 CHECK_TC="yes"
 TEST_GROUP="239.10.10.10"
@@ -523,6 +523,16 @@ vlmc_router_port_test()
 	bridge vlan set vid 10 dev $swp1 mcast_router 1
 }
 
+vlmc_filtering_test()
+{
+	RET=0
+	ip link set dev br0 type bridge vlan_filtering 0
+	ip -j -d link show dev bridge | \
+	jq -e "select(.[0].linkinfo.info_data.mcast_vlan_snooping == 1)" &>/dev/null
+	check_fail $? "Vlan filtering is disabled but multicast vlan snooping is still enabled"
+	log_test "Disable multicast vlan snooping when vlan filtering is disabled"
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
cgit v1.2.3


From 5ce7ab4961a9320ca0836e06849210d088723a56 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Wed, 24 Nov 2021 11:14:27 +0900
Subject: af_unix: Remove UNIX_ABSTRACT() macro and test sun_path[0] instead.

In BSD and abstract address cases, we store sockets in the hash table with
keys between 0 and UNIX_HASH_SIZE - 1.  However, the hash saved in a socket
varies depending on its address type; sockets with BSD addresses always
have UNIX_HASH_SIZE in their unix_sk(sk)->addr->hash.

This is just for the UNIX_ABSTRACT() macro used to check the address type.
The difference of the saved hashes comes from the first byte of the address
in the first place.  So, we can test it directly.

Then we can keep a real hash in each socket and replace unix_table_lock
with per-hash locks in the later patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/unix/af_unix.c                                        | 6 ++----
 tools/testing/selftests/bpf/progs/bpf_iter_unix.c         | 2 +-
 tools/testing/selftests/bpf/progs/bpf_tracing_net.h       | 2 --
 tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c | 2 +-
 4 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2710e7dfc638..80752ab9ed97 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -134,8 +134,6 @@ static struct hlist_head *unix_sockets_unbound(void *addr)
 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
 }
 
-#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
-
 #ifdef CONFIG_SECURITY_NETWORK
 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 {
@@ -3292,9 +3290,9 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			i = 0;
 			len = u->addr->len -
 				offsetof(struct sockaddr_un, sun_path);
-			if (!UNIX_ABSTRACT(s))
+			if (u->addr->name->sun_path[0]) {
 				len--;
-			else {
+			} else {
 				seq_putc(seq, '@');
 				i++;
 			}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
index 94423902685d..c21e3f545371 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
@@ -49,7 +49,7 @@ int dump_unix(struct bpf_iter__unix *ctx)
 		       sock_i_ino(sk));
 
 	if (unix_sk->addr) {
-		if (!UNIX_ABSTRACT(unix_sk)) {
+		if (unix_sk->addr->name->sun_path[0]) {
 			BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path);
 		} else {
 			/* The name of the abstract UNIX domain socket starts
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index eef5646ddb19..e0f42601be9b 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -6,8 +6,6 @@
 #define AF_INET6		10
 
 #define __SO_ACCEPTCON		(1 << 16)
-#define UNIX_HASH_SIZE		256
-#define UNIX_ABSTRACT(unix_sk)	(unix_sk->addr->hash < UNIX_HASH_SIZE)
 
 #define SOL_TCP			6
 #define TCP_CONGESTION		13
diff --git a/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c b/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c
index a408ec95cba4..eacda9fe07eb 100644
--- a/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c
+++ b/tools/testing/selftests/bpf/progs/test_skc_to_unix_sock.c
@@ -23,7 +23,7 @@ int BPF_PROG(unix_listen, struct socket *sock, int backlog)
 	if (!unix_sk)
 		return 0;
 
-	if (!UNIX_ABSTRACT(unix_sk))
+	if (unix_sk->addr->name->sun_path[0])
 		return 0;
 
 	len = unix_sk->addr->len - sizeof(short);
-- 
cgit v1.2.3


From 341ac5ffc4bd859103899c876902caf07cc97ea4 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Sun, 28 Nov 2021 22:16:32 +0800
Subject: libbpf: Support static initialization of BPF_MAP_TYPE_PROG_ARRAY

Support static initialization of BPF_MAP_TYPE_PROG_ARRAY with a
syntax similar to map-in-map initialization ([0]):

    SEC("socket")
    int tailcall_1(void *ctx)
    {
        return 0;
    }

    struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 2);
        __uint(key_size, sizeof(__u32));
        __array(values, int (void *));
    } prog_array_init SEC(".maps") = {
        .values = {
            [1] = (void *)&tailcall_1,
        },
    };

Here's the relevant part of libbpf debug log showing what's
going on with prog-array initialization:

libbpf: sec '.relsocket': collecting relocation for section(3) 'socket'
libbpf: sec '.relsocket': relo #0: insn #2 against 'prog_array_init'
libbpf: prog 'entry': found map 0 (prog_array_init, sec 4, off 0) for insn #0
libbpf: .maps relo #0: for 3 value 0 rel->r_offset 32 name 53 ('tailcall_1')
libbpf: .maps relo #0: map 'prog_array_init' slot [1] points to prog 'tailcall_1'
libbpf: map 'prog_array_init': created successfully, fd=5
libbpf: map 'prog_array_init': slot [1] set to prog 'tailcall_1' fd=6

  [0] Closes: https://github.com/libbpf/libbpf/issues/354

Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211128141633.502339-2-hengqi.chen@gmail.com
---
 tools/lib/bpf/libbpf.c | 154 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 121 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 672671879b21..b59fede08ba7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2277,6 +2277,9 @@ int parse_btf_map_def(const char *map_name, struct btf *btf,
 			map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE;
 		}
 		else if (strcmp(name, "values") == 0) {
+			bool is_map_in_map = bpf_map_type__is_map_in_map(map_def->map_type);
+			bool is_prog_array = map_def->map_type == BPF_MAP_TYPE_PROG_ARRAY;
+			const char *desc = is_map_in_map ? "map-in-map inner" : "prog-array value";
 			char inner_map_name[128];
 			int err;
 
@@ -2290,8 +2293,8 @@ int parse_btf_map_def(const char *map_name, struct btf *btf,
 					map_name, name);
 				return -EINVAL;
 			}
-			if (!bpf_map_type__is_map_in_map(map_def->map_type)) {
-				pr_warn("map '%s': should be map-in-map.\n",
+			if (!is_map_in_map && !is_prog_array) {
+				pr_warn("map '%s': should be map-in-map or prog-array.\n",
 					map_name);
 				return -ENOTSUP;
 			}
@@ -2303,22 +2306,30 @@ int parse_btf_map_def(const char *map_name, struct btf *btf,
 			map_def->value_size = 4;
 			t = btf__type_by_id(btf, m->type);
 			if (!t) {
-				pr_warn("map '%s': map-in-map inner type [%d] not found.\n",
-					map_name, m->type);
+				pr_warn("map '%s': %s type [%d] not found.\n",
+					map_name, desc, m->type);
 				return -EINVAL;
 			}
 			if (!btf_is_array(t) || btf_array(t)->nelems) {
-				pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n",
-					map_name);
+				pr_warn("map '%s': %s spec is not a zero-sized array.\n",
+					map_name, desc);
 				return -EINVAL;
 			}
 			t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL);
 			if (!btf_is_ptr(t)) {
-				pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n",
-					map_name, btf_kind_str(t));
+				pr_warn("map '%s': %s def is of unexpected kind %s.\n",
+					map_name, desc, btf_kind_str(t));
 				return -EINVAL;
 			}
 			t = skip_mods_and_typedefs(btf, t->type, NULL);
+			if (is_prog_array) {
+				if (!btf_is_func_proto(t)) {
+					pr_warn("map '%s': prog-array value def is of unexpected kind %s.\n",
+						map_name, btf_kind_str(t));
+					return -EINVAL;
+				}
+				continue;
+			}
 			if (!btf_is_struct(t)) {
 				pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n",
 					map_name, btf_kind_str(t));
@@ -4940,7 +4951,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	return err;
 }
 
-static int init_map_slots(struct bpf_object *obj, struct bpf_map *map)
+static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map)
 {
 	const struct bpf_map *targ_map;
 	unsigned int i;
@@ -4952,6 +4963,7 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map)
 
 		targ_map = map->init_slots[i];
 		fd = bpf_map__fd(targ_map);
+
 		if (obj->gen_loader) {
 			pr_warn("// TODO map_update_elem: idx %td key %d value==map_idx %td\n",
 				map - obj->maps, i, targ_map - obj->maps);
@@ -4962,8 +4974,7 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map)
 		if (err) {
 			err = -errno;
 			pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
-				map->name, i, targ_map->name,
-				fd, err);
+				map->name, i, targ_map->name, fd, err);
 			return err;
 		}
 		pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
@@ -4976,6 +4987,59 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map)
 	return 0;
 }
 
+static int init_prog_array_slots(struct bpf_object *obj, struct bpf_map *map)
+{
+	const struct bpf_program *targ_prog;
+	unsigned int i;
+	int fd, err;
+
+	if (obj->gen_loader)
+		return -ENOTSUP;
+
+	for (i = 0; i < map->init_slots_sz; i++) {
+		if (!map->init_slots[i])
+			continue;
+
+		targ_prog = map->init_slots[i];
+		fd = bpf_program__fd(targ_prog);
+
+		err = bpf_map_update_elem(map->fd, &i, &fd, 0);
+		if (err) {
+			err = -errno;
+			pr_warn("map '%s': failed to initialize slot [%d] to prog '%s' fd=%d: %d\n",
+				map->name, i, targ_prog->name, fd, err);
+			return err;
+		}
+		pr_debug("map '%s': slot [%d] set to prog '%s' fd=%d\n",
+			 map->name, i, targ_prog->name, fd);
+	}
+
+	zfree(&map->init_slots);
+	map->init_slots_sz = 0;
+
+	return 0;
+}
+
+static int bpf_object_init_prog_arrays(struct bpf_object *obj)
+{
+	struct bpf_map *map;
+	int i, err;
+
+	for (i = 0; i < obj->nr_maps; i++) {
+		map = &obj->maps[i];
+
+		if (!map->init_slots_sz || map->def.type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+
+		err = init_prog_array_slots(obj, map);
+		if (err < 0) {
+			zclose(map->fd);
+			return err;
+		}
+	}
+	return 0;
+}
+
 static int
 bpf_object__create_maps(struct bpf_object *obj)
 {
@@ -5042,8 +5106,8 @@ retry:
 				}
 			}
 
-			if (map->init_slots_sz) {
-				err = init_map_slots(obj, map);
+			if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) {
+				err = init_map_in_map_slots(obj, map);
 				if (err < 0) {
 					zclose(map->fd);
 					goto err_out;
@@ -6189,9 +6253,11 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj,
 	int i, j, nrels, new_sz;
 	const struct btf_var_secinfo *vi = NULL;
 	const struct btf_type *sec, *var, *def;
-	struct bpf_map *map = NULL, *targ_map;
+	struct bpf_map *map = NULL, *targ_map = NULL;
+	struct bpf_program *targ_prog = NULL;
+	bool is_prog_array, is_map_in_map;
 	const struct btf_member *member;
-	const char *name, *mname;
+	const char *name, *mname, *type;
 	unsigned int moff;
 	Elf64_Sym *sym;
 	Elf64_Rel *rel;
@@ -6218,11 +6284,6 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj,
 			return -LIBBPF_ERRNO__FORMAT;
 		}
 		name = elf_sym_str(obj, sym->st_name) ?: "<?>";
-		if (sym->st_shndx != obj->efile.btf_maps_shndx) {
-			pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n",
-				i, name);
-			return -LIBBPF_ERRNO__RELOC;
-		}
 
 		pr_debug(".maps relo #%d: for %zd value %zd rel->r_offset %zu name %d ('%s')\n",
 			 i, (ssize_t)(rel->r_info >> 32), (size_t)sym->st_value,
@@ -6244,19 +6305,45 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj,
 			return -EINVAL;
 		}
 
-		if (!bpf_map_type__is_map_in_map(map->def.type))
-			return -EINVAL;
-		if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS &&
-		    map->def.key_size != sizeof(int)) {
-			pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n",
-				i, map->name, sizeof(int));
+		is_map_in_map = bpf_map_type__is_map_in_map(map->def.type);
+		is_prog_array = map->def.type == BPF_MAP_TYPE_PROG_ARRAY;
+		type = is_map_in_map ? "map" : "prog";
+		if (is_map_in_map) {
+			if (sym->st_shndx != obj->efile.btf_maps_shndx) {
+				pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n",
+					i, name);
+				return -LIBBPF_ERRNO__RELOC;
+			}
+			if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+			    map->def.key_size != sizeof(int)) {
+				pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n",
+					i, map->name, sizeof(int));
+				return -EINVAL;
+			}
+			targ_map = bpf_object__find_map_by_name(obj, name);
+			if (!targ_map) {
+				pr_warn(".maps relo #%d: '%s' isn't a valid map reference\n",
+					i, name);
+				return -ESRCH;
+			}
+		} else if (is_prog_array) {
+			targ_prog = bpf_object__find_program_by_name(obj, name);
+			if (!targ_prog) {
+				pr_warn(".maps relo #%d: '%s' isn't a valid program reference\n",
+					i, name);
+				return -ESRCH;
+			}
+			if (targ_prog->sec_idx != sym->st_shndx ||
+			    targ_prog->sec_insn_off * 8 != sym->st_value ||
+			    prog_is_subprog(obj, targ_prog)) {
+				pr_warn(".maps relo #%d: '%s' isn't an entry-point program\n",
+					i, name);
+				return -LIBBPF_ERRNO__RELOC;
+			}
+		} else {
 			return -EINVAL;
 		}
 
-		targ_map = bpf_object__find_map_by_name(obj, name);
-		if (!targ_map)
-			return -ESRCH;
-
 		var = btf__type_by_id(obj->btf, vi->type);
 		def = skip_mods_and_typedefs(obj->btf, var->type, NULL);
 		if (btf_vlen(def) == 0)
@@ -6287,10 +6374,10 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj,
 			       (new_sz - map->init_slots_sz) * host_ptr_sz);
 			map->init_slots_sz = new_sz;
 		}
-		map->init_slots[moff] = targ_map;
+		map->init_slots[moff] = is_map_in_map ? (void *)targ_map : (void *)targ_prog;
 
-		pr_debug(".maps relo #%d: map '%s' slot [%d] points to map '%s'\n",
-			 i, map->name, moff, name);
+		pr_debug(".maps relo #%d: map '%s' slot [%d] points to %s '%s'\n",
+			 i, map->name, moff, type, name);
 	}
 
 	return 0;
@@ -7304,6 +7391,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	err = err ? : bpf_object__create_maps(obj);
 	err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path);
 	err = err ? : bpf_object__load_progs(obj, attr->log_level);
+	err = err ? : bpf_object_init_prog_arrays(obj);
 
 	if (obj->gen_loader) {
 		/* reset FDs */
-- 
cgit v1.2.3


From baeead213e67a9554d589a2845c634b8e473d107 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Sun, 28 Nov 2021 22:16:33 +0800
Subject: selftests/bpf: Test BPF_MAP_TYPE_PROG_ARRAY static initialization

Add testcase for BPF_MAP_TYPE_PROG_ARRAY static initialization.

Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211128141633.502339-3-hengqi.chen@gmail.com
---
 .../selftests/bpf/prog_tests/prog_array_init.c     | 32 ++++++++++++++++++
 .../selftests/bpf/progs/test_prog_array_init.c     | 39 ++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/prog_array_init.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_prog_array_init.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/prog_array_init.c b/tools/testing/selftests/bpf/prog_tests/prog_array_init.c
new file mode 100644
index 000000000000..fc4657619739
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/prog_array_init.c
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 Hengqi Chen */
+
+#include <test_progs.h>
+#include "test_prog_array_init.skel.h"
+
+void test_prog_array_init(void)
+{
+	struct test_prog_array_init *skel;
+	int err;
+
+	skel = test_prog_array_init__open();
+	if (!ASSERT_OK_PTR(skel, "could not open BPF object"))
+		return;
+
+	skel->rodata->my_pid = getpid();
+
+	err = test_prog_array_init__load(skel);
+	if (!ASSERT_OK(err, "could not load BPF object"))
+		goto cleanup;
+
+	skel->links.entry = bpf_program__attach_raw_tracepoint(skel->progs.entry, "sys_enter");
+	if (!ASSERT_OK_PTR(skel->links.entry, "could not attach BPF program"))
+		goto cleanup;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->value, 42, "unexpected value");
+
+cleanup:
+	test_prog_array_init__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_prog_array_init.c b/tools/testing/selftests/bpf/progs/test_prog_array_init.c
new file mode 100644
index 000000000000..2cd138356126
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_prog_array_init.c
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 Hengqi Chen */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+const volatile pid_t my_pid = 0;
+int value = 0;
+
+SEC("raw_tp/sys_enter")
+int tailcall_1(void *ctx)
+{
+	value = 42;
+	return 0;
+}
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 2);
+	__uint(key_size, sizeof(__u32));
+	__array(values, int (void *));
+} prog_array_init SEC(".maps") = {
+	.values = {
+		[1] = (void *)&tailcall_1,
+	},
+};
+
+SEC("raw_tp/sys_enter")
+int entry(void *ctx)
+{
+	pid_t pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (pid != my_pid)
+		return 0;
+
+	bpf_tail_call(ctx, &prog_array_init, 1);
+	return 0;
+}
-- 
cgit v1.2.3


From 754d71be52926563ddd5cef2a3837a5742a08c37 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Mon, 29 Nov 2021 10:58:50 +0100
Subject: selftests: net: bridge: fix typo in vlan_filtering dependency test

Prior patch:
]# TESTS=vlmc_filtering_test ./bridge_vlan_mcast.sh
TEST: Vlan multicast snooping enable                                [ OK ]
Device "bridge" does not exist.
TEST: Disable multicast vlan snooping when vlan filtering is disabled   [FAIL]
        Vlan filtering is disabled but multicast vlan snooping is still enabled

After patch:
# TESTS=vlmc_filtering_test ./bridge_vlan_mcast.sh
TEST: Vlan multicast snooping enable                                [ OK ]
TEST: Disable multicast vlan snooping when vlan filtering is disabled   [ OK ]

Fixes: f5a9dd58f48b7c ("selftests: net: bridge: add test for vlan_filtering dependency")
Cc: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
index 5224a5a8595b..8748d1b1d95b 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh
@@ -527,7 +527,7 @@ vlmc_filtering_test()
 {
 	RET=0
 	ip link set dev br0 type bridge vlan_filtering 0
-	ip -j -d link show dev bridge | \
+	ip -j -d link show dev br0 | \
 	jq -e "select(.[0].linkinfo.info_data.mcast_vlan_snooping == 1)" &>/dev/null
 	check_fail $? "Vlan filtering is disabled but multicast vlan snooping is still enabled"
 	log_test "Disable multicast vlan snooping when vlan filtering is disabled"
-- 
cgit v1.2.3


From 43174f0d4597325cb91f1f1f55263eb6e6101036 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Mon, 29 Nov 2021 10:00:40 +0000
Subject: libbpf: Silence uninitialized warning/error in
 btf_dump_dump_type_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling libbpf with gcc 4.8.5, we see:

  CC       staticobjs/btf_dump.o
btf_dump.c: In function ‘btf_dump_dump_type_data.isra.24’:
btf_dump.c:2296:5: error: ‘err’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
  if (err < 0)
     ^
cc1: all warnings being treated as errors
make: *** [staticobjs/btf_dump.o] Error 1

While gcc 4.8.5 is too old to build the upstream kernel, it's possible it
could be used to build standalone libbpf which suffers from the same problem.
Silence the error by initializing 'err' to 0.  The warning/error seems to be
a false positive since err is set early in the function.  Regardless we
shouldn't prevent libbpf from building for this.

Fixes: 920d16af9b42 ("libbpf: BTF dumper support for typed data")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1638180040-8037-1-git-send-email-alan.maguire@oracle.com
---
 tools/lib/bpf/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 05f3e7dfec0a..f06a1d343c92 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -2216,7 +2216,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d,
 				   __u8 bits_offset,
 				   __u8 bit_sz)
 {
-	int size, err;
+	int size, err = 0;
 
 	size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset);
 	if (size < 0)
-- 
cgit v1.2.3


From 03ff1b1def73f817e196bf96ab36ac259490bd7c Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 29 Nov 2021 10:39:21 -0500
Subject: wireguard: selftests: increase default dmesg log size

The selftests currently parse the kernel log at the end to track
potential memory leaks. With these tests now reading off the end of the
buffer, due to recent optimizations, some creation messages were lost,
making the tests think that there was a free without an alloc. Fix this
by increasing the kernel log size.

Fixes: 24b70eeeb4f4 ("wireguard: use synchronize_net rather than synchronize_rcu")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/wireguard/qemu/kernel.config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 74db83a0aedd..a9b5a520a1d2 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -66,6 +66,7 @@ CONFIG_PROC_SYSCTL=y
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15
+CONFIG_LOG_BUF_SHIFT=18
 CONFIG_PRINTK_TIME=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_LEGACY_VSYSCALL_NONE=y
-- 
cgit v1.2.3


From 782c72af567fc2ef09bd7615d0307f24de72c7e0 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 29 Nov 2021 10:39:22 -0500
Subject: wireguard: selftests: actually test for routing loops

We previously removed the restriction on looping to self, and then added
a test to make sure the kernel didn't blow up during a routing loop. The
kernel didn't blow up, thankfully, but on certain architectures where
skb fragmentation is easier, such as ppc64, the skbs weren't actually
being discarded after a few rounds through. But the test wasn't catching
this. So actually test explicitly for massive increases in tx to see if
we have a routing loop. Note that the actual loop problem will need to
be addressed in a different commit.

Fixes: b673e24aad36 ("wireguard: socket: remove errant restriction on looping to self")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/wireguard/netns.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh
index ebc4ee0fe179..2e5c1630885e 100755
--- a/tools/testing/selftests/wireguard/netns.sh
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -276,7 +276,11 @@ n0 ping -W 1 -c 1 192.168.241.2
 n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7
 ip2 link del wg0
 ip2 link del wg1
-! n0 ping -W 1 -c 10 -f 192.168.241.2 || false # Should not crash kernel
+read _ _ tx_bytes_before < <(n0 wg show wg1 transfer)
+! n0 ping -W 1 -c 10 -f 192.168.241.2 || false
+sleep 1
+read _ _ tx_bytes_after < <(n0 wg show wg1 transfer)
+(( tx_bytes_after - tx_bytes_before < 70000 ))
 
 ip0 link del wg1
 ip1 link del wg0
-- 
cgit v1.2.3


From 7e938beb8321d34f040557b8915b228af125f73c Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Mon, 29 Nov 2021 10:39:24 -0500
Subject: wireguard: selftests: rename DEBUG_PI_LIST to DEBUG_PLIST

DEBUG_PI_LIST was renamed to DEBUG_PLIST since 8e18faeac3 ("lib/plist:
rename DEBUG_PI_LIST to DEBUG_PLIST").

Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Fixes: 8e18faeac3e4 ("lib/plist: rename DEBUG_PI_LIST to DEBUG_PLIST")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/wireguard/qemu/debug.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config
index fe07d97df9fa..2b321b8a96cf 100644
--- a/tools/testing/selftests/wireguard/qemu/debug.config
+++ b/tools/testing/selftests/wireguard/qemu/debug.config
@@ -47,7 +47,7 @@ CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_TRACE_IRQFLAGS=y
 CONFIG_DEBUG_BUGVERBOSE=y
 CONFIG_DEBUG_LIST=y
-CONFIG_DEBUG_PI_LIST=y
+CONFIG_DEBUG_PLIST=y
 CONFIG_PROVE_RCU=y
 CONFIG_SPARSE_RCU_POINTER=y
 CONFIG_RCU_CPU_STALL_TIMEOUT=21
-- 
cgit v1.2.3


From 20ae1d6aa159eb91a9bf09ff92ccaa94dbea92c2 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 29 Nov 2021 10:39:25 -0500
Subject: wireguard: device: reset peer src endpoint when netns exits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each peer's endpoint contains a dst_cache entry that takes a reference
to another netdev. When the containing namespace exits, we take down the
socket and prevent future sockets from being created (by setting
creating_net to NULL), which removes that potential reference on the
netns. However, it doesn't release references to the netns that a netdev
cached in dst_cache might be taking, so the netns still might fail to
exit. Since the socket is gimped anyway, we can simply clear all the
dst_caches (by way of clearing the endpoint src), which will release all
references.

However, the current dst_cache_reset function only releases those
references lazily. But it turns out that all of our usages of
wg_socket_clear_peer_endpoint_src are called from contexts that are not
exactly high-speed or bottle-necked. For example, when there's
connection difficulty, or when userspace is reconfiguring the interface.
And in particular for this patch, when the netns is exiting. So for
those cases, it makes more sense to call dst_release immediately. For
that, we add a small helper function to dst_cache.

This patch also adds a test to netns.sh from Hangbin Liu to ensure this
doesn't regress.

Tested-by: Hangbin Liu <liuhangbin@gmail.com>
Reported-by: Xiumei Mu <xmu@redhat.com>
Cc: Toke Høiland-Jørgensen <toke@redhat.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Fixes: 900575aa33a3 ("wireguard: device: avoid circular netns references")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wireguard/device.c             |  3 +++
 drivers/net/wireguard/socket.c             |  2 +-
 include/net/dst_cache.h                    | 11 +++++++++++
 net/core/dst_cache.c                       | 19 +++++++++++++++++++
 tools/testing/selftests/wireguard/netns.sh | 24 +++++++++++++++++++++++-
 5 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index 551ddaaaf540..77e64ea6be67 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -398,6 +398,7 @@ static struct rtnl_link_ops link_ops __read_mostly = {
 static void wg_netns_pre_exit(struct net *net)
 {
 	struct wg_device *wg;
+	struct wg_peer *peer;
 
 	rtnl_lock();
 	list_for_each_entry(wg, &device_list, device_list) {
@@ -407,6 +408,8 @@ static void wg_netns_pre_exit(struct net *net)
 			mutex_lock(&wg->device_update_lock);
 			rcu_assign_pointer(wg->creating_net, NULL);
 			wg_socket_reinit(wg, NULL, NULL);
+			list_for_each_entry(peer, &wg->peer_list, peer_list)
+				wg_socket_clear_peer_endpoint_src(peer);
 			mutex_unlock(&wg->device_update_lock);
 		}
 	}
diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c
index 8c496b747108..6f07b949cb81 100644
--- a/drivers/net/wireguard/socket.c
+++ b/drivers/net/wireguard/socket.c
@@ -308,7 +308,7 @@ void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer)
 {
 	write_lock_bh(&peer->endpoint_lock);
 	memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6));
-	dst_cache_reset(&peer->endpoint_cache);
+	dst_cache_reset_now(&peer->endpoint_cache);
 	write_unlock_bh(&peer->endpoint_lock);
 }
 
diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index 67634675e919..df6622a5fe98 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -79,6 +79,17 @@ static inline void dst_cache_reset(struct dst_cache *dst_cache)
 	dst_cache->reset_ts = jiffies;
 }
 
+/**
+ *	dst_cache_reset_now - invalidate the cache contents immediately
+ *	@dst_cache: the cache
+ *
+ *	The caller must be sure there are no concurrent users, as this frees
+ *	all dst_cache users immediately, rather than waiting for the next
+ *	per-cpu usage like dst_cache_reset does. Most callers should use the
+ *	higher speed lazily-freed dst_cache_reset function instead.
+ */
+void dst_cache_reset_now(struct dst_cache *dst_cache);
+
 /**
  *	dst_cache_init - initialize the cache, allocating the required storage
  *	@dst_cache: the cache
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index be74ab4551c2..0ccfd5fa5cb9 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
 	free_percpu(dst_cache->cache);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_reset_now(struct dst_cache *dst_cache)
+{
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	dst_cache->reset_ts = jiffies;
+	for_each_possible_cpu(i) {
+		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
+		struct dst_entry *dst = idst->dst;
+
+		idst->cookie = 0;
+		idst->dst = NULL;
+		dst_release(dst);
+	}
+}
+EXPORT_SYMBOL_GPL(dst_cache_reset_now);
diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh
index 2e5c1630885e..8a9461aa0878 100755
--- a/tools/testing/selftests/wireguard/netns.sh
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -613,6 +613,28 @@ ip0 link set wg0 up
 kill $ncat_pid
 ip0 link del wg0
 
+# Ensure that dst_cache references don't outlive netns lifetime
+ip1 link add dev wg0 type wireguard
+ip2 link add dev wg0 type wireguard
+configure_peers
+ip1 link add veth1 type veth peer name veth2
+ip1 link set veth2 netns $netns2
+ip1 addr add fd00:aa::1/64 dev veth1
+ip2 addr add fd00:aa::2/64 dev veth2
+ip1 link set veth1 up
+ip2 link set veth2 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+ip1 -6 route add default dev veth1 via fd00:aa::2
+ip2 -6 route add default dev veth2 via fd00:aa::1
+n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2
+n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1
+n1 ping6 -c 1 fd00::2
+pp ip netns delete $netns1
+pp ip netns delete $netns2
+pp ip netns add $netns1
+pp ip netns add $netns2
+
 # Ensure there aren't circular reference loops
 ip1 link add wg1 type wireguard
 ip2 link add wg2 type wireguard
@@ -631,7 +653,7 @@ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do
 done < /dev/kmsg
 alldeleted=1
 for object in "${!objects[@]}"; do
-	if [[ ${objects["$object"]} != *createddestroyed ]]; then
+	if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then
 		echo "Error: $object: merely ${objects["$object"]}" >&3
 		alldeleted=0
 	fi
-- 
cgit v1.2.3


From 81835ee113e92683160030fe3328f3c3187a92c2 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Fri, 26 Nov 2021 16:28:31 +0100
Subject: KVM: selftests: page_table_test: fix calculation of
 guest_test_phys_mem

A kvm_page_table_test run with its default settings fails on VMX due to
memory region add failure:
> ==== Test Assertion Failure ====
>  lib/kvm_util.c:952: ret == 0
>  pid=10538 tid=10538 errno=17 - File exists
>     1  0x00000000004057d1: vm_userspace_mem_region_add at kvm_util.c:947
>     2  0x0000000000401ee9: pre_init_before_test at kvm_page_table_test.c:302
>     3   (inlined by) run_test at kvm_page_table_test.c:374
>     4  0x0000000000409754: for_each_guest_mode at guest_modes.c:53
>     5  0x0000000000401860: main at kvm_page_table_test.c:500
>     6  0x00007f82ae2d8554: ?? ??:0
>     7  0x0000000000401894: _start at ??:?
>  KVM_SET_USER_MEMORY_REGION IOCTL failed,
>  rc: -1 errno: 17
>  slot: 1 flags: 0x0
>  guest_phys_addr: 0xc0000000 size: 0x40000000

This is because the memory range that this test is trying to add
(0x0c0000000 - 0x100000000) conflicts with LAPIC mapping at 0x0fee00000.

Looking at the code it seems that guest_test_*phys*_mem variable gets
mistakenly overwritten with guest_test_*virt*_mem while trying to adjust
the former for alignment.
With the correct variable adjusted this test runs successfully.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <52e487458c3172923549bbcf9dfccfbe6faea60b.1637940473.git.maciej.szmigiero@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 3836322add00..ba1fdc3dcf4a 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -280,7 +280,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
 #ifdef __s390x__
 	alignment = max(0x100000, alignment);
 #endif
-	guest_test_phys_mem = align_down(guest_test_virt_mem, alignment);
+	guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
 
 	/* Set up the shared data structure test_args */
 	test_args.vm = vm;
-- 
cgit v1.2.3


From dc79c9f4eb6b4f4584ba0f6f334b907283ed4b6c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 22 Nov 2021 19:50:32 -0500
Subject: selftests: sev_migrate_tests: add tests for
 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM

I am putting the tests in sev_migrate_tests because the failure conditions are
very similar and some of the setup code can be reused, too.

The tests cover both successful creation of a mirror VM, and error
conditions.

Cc: Peter Gonda <pgonda@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Message-Id: <20211123005036.2954379-9-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/x86_64/sev_migrate_tests.c       | 112 +++++++++++++++++++--
 1 file changed, 105 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 0cd7e2eaa895..d265cea5de85 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -54,12 +54,15 @@ static struct kvm_vm *sev_vm_create(bool es)
 	return vm;
 }
 
-static struct kvm_vm *__vm_create(void)
+static struct kvm_vm *aux_vm_create(bool with_vcpus)
 {
 	struct kvm_vm *vm;
 	int i;
 
 	vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+	if (!with_vcpus)
+		return vm;
+
 	for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
 		vm_vcpu_add(vm, i);
 
@@ -93,7 +96,7 @@ static void test_sev_migrate_from(bool es)
 
 	src_vm = sev_vm_create(es);
 	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
-		dst_vms[i] = __vm_create();
+		dst_vms[i] = aux_vm_create(true);
 
 	/* Initial migration from the src to the first dst. */
 	sev_migrate_from(dst_vms[0]->fd, src_vm->fd);
@@ -162,7 +165,7 @@ static void test_sev_migrate_parameters(void)
 	sev_vm = sev_vm_create(/* es= */ false);
 	sev_es_vm = sev_vm_create(/* es= */ true);
 	vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
-	vm_no_sev = __vm_create();
+	vm_no_sev = aux_vm_create(true);
 	sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
 	sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
 	vm_vcpu_add(sev_es_vm_no_vmsa, 1);
@@ -203,11 +206,106 @@ static void test_sev_migrate_parameters(void)
 	kvm_vm_free(vm_no_sev);
 }
 
+static int __sev_mirror_create(int dst_fd, int src_fd)
+{
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
+		.args = { src_fd }
+	};
+
+	return ioctl(dst_fd, KVM_ENABLE_CAP, &cap);
+}
+
+
+static void sev_mirror_create(int dst_fd, int src_fd)
+{
+	int ret;
+
+	ret = __sev_mirror_create(dst_fd, src_fd);
+	TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno);
+}
+
+static void test_sev_mirror(bool es)
+{
+	struct kvm_vm *src_vm, *dst_vm;
+	struct kvm_sev_launch_start start = {
+		.policy = es ? SEV_POLICY_ES : 0
+	};
+	int i;
+
+	src_vm = sev_vm_create(es);
+	dst_vm = aux_vm_create(false);
+
+	sev_mirror_create(dst_vm->fd, src_vm->fd);
+
+	/* Check that we can complete creation of the mirror VM.  */
+	for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
+		vm_vcpu_add(dst_vm, i);
+	sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_START, &start);
+	if (es)
+		sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+
+	kvm_vm_free(src_vm);
+	kvm_vm_free(dst_vm);
+}
+
+static void test_sev_mirror_parameters(void)
+{
+	struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu;
+	int ret;
+
+	sev_vm = sev_vm_create(/* es= */ false);
+	sev_es_vm = sev_vm_create(/* es= */ true);
+	vm_with_vcpu = aux_vm_create(true);
+	vm_no_vcpu = aux_vm_create(false);
+
+	ret = __sev_mirror_create(sev_vm->fd, sev_vm->fd);
+	TEST_ASSERT(
+		ret == -1 && errno == EINVAL,
+		"Should not be able copy context to self. ret: %d, errno: %d\n",
+		ret, errno);
+
+	ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd);
+	TEST_ASSERT(
+		ret == -1 && errno == EINVAL,
+		"Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n",
+		ret, errno);
+
+	ret = __sev_mirror_create(sev_es_vm->fd, sev_vm->fd);
+	TEST_ASSERT(
+		ret == -1 && errno == EINVAL,
+		"Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n",
+		ret, errno);
+
+	ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd);
+	TEST_ASSERT(ret == -1 && errno == EINVAL,
+		    "Copy context requires SEV enabled. ret %d, errno: %d\n", ret,
+		    errno);
+
+	ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd);
+	TEST_ASSERT(
+		ret == -1 && errno == EINVAL,
+		"SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n",
+		ret, errno);
+
+	kvm_vm_free(sev_vm);
+	kvm_vm_free(sev_es_vm);
+	kvm_vm_free(vm_with_vcpu);
+	kvm_vm_free(vm_no_vcpu);
+}
+
 int main(int argc, char *argv[])
 {
-	test_sev_migrate_from(/* es= */ false);
-	test_sev_migrate_from(/* es= */ true);
-	test_sev_migrate_locking();
-	test_sev_migrate_parameters();
+	if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
+		test_sev_migrate_from(/* es= */ false);
+		test_sev_migrate_from(/* es= */ true);
+		test_sev_migrate_locking();
+		test_sev_migrate_parameters();
+	}
+	if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
+		test_sev_mirror(/* es= */ false);
+		test_sev_mirror(/* es= */ true);
+		test_sev_mirror_parameters();
+	}
 	return 0;
 }
-- 
cgit v1.2.3


From 17d44a96f000fe1040d4ba1c34e458c63be6b7ce Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 22 Nov 2021 19:50:34 -0500
Subject: KVM: SEV: Prohibit migration of a VM that has mirrors

VMs that mirror an encryption context rely on the owner to keep the
ASID allocated.  Performing a KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM
would cause a dangling ASID:

1. copy context from A to B (gets ref to A)
2. move context from A to L (moves ASID from A to L)
3. close L (releases ASID from L, B still references it)

The right way to do the handoff instead is to create a fresh mirror VM
on the destination first:

1. copy context from A to B (gets ref to A)
[later] 2. close B (releases ref to A)
3. move context from A to L (moves ASID from A to L)
4. copy context from L to M

So, catch the situation by adding a count of how many VMs are
mirroring this one's encryption context.

Fixes: 0b020f5af092 ("KVM: SEV: Add support for SEV-ES intra host migration")
Message-Id: <20211123005036.2954379-11-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/sev.c                             | 22 ++++++++++++-
 arch/x86/kvm/svm/svm.h                             |  1 +
 .../selftests/kvm/x86_64/sev_migrate_tests.c       | 37 ++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 025d9731b66c..89a716290fac 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1696,6 +1696,16 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
 	}
 
 	src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+	/*
+	 * VMs mirroring src's encryption context rely on it to keep the
+	 * ASID allocated, but below we are clearing src_sev->asid.
+	 */
+	if (src_sev->num_mirrored_vms) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	dst_sev->misc_cg = get_current_misc_cg();
 	cg_cleanup_sev = dst_sev;
 	if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1987,6 +1997,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	 */
 	source_sev = &to_kvm_svm(source_kvm)->sev_info;
 	kvm_get_kvm(source_kvm);
+	source_sev->num_mirrored_vms++;
 
 	/* Set enc_context_owner and copy its encryption context over */
 	mirror_sev = &to_kvm_svm(kvm)->sev_info;
@@ -2019,12 +2030,21 @@ void sev_vm_destroy(struct kvm *kvm)
 	struct list_head *head = &sev->regions_list;
 	struct list_head *pos, *q;
 
+	WARN_ON(sev->num_mirrored_vms);
+
 	if (!sev_guest(kvm))
 		return;
 
 	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
 	if (is_mirroring_enc_context(kvm)) {
-		kvm_put_kvm(sev->enc_context_owner);
+		struct kvm *owner_kvm = sev->enc_context_owner;
+		struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+		mutex_lock(&owner_kvm->lock);
+		if (!WARN_ON(!owner_sev->num_mirrored_vms))
+			owner_sev->num_mirrored_vms--;
+		mutex_unlock(&owner_kvm->lock);
+		kvm_put_kvm(owner_kvm);
 		return;
 	}
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5faad3dc10e2..1c7306c370fa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ struct kvm_sev_info {
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
 	struct kvm *enc_context_owner; /* Owner of copied encryption context */
+	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
 	struct misc_cg *misc_cg; /* For misc cgroup accounting */
 	atomic_t migration_in_progress;
 };
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index d265cea5de85..29b18d565cf4 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -294,6 +294,41 @@ static void test_sev_mirror_parameters(void)
 	kvm_vm_free(vm_no_vcpu);
 }
 
+static void test_sev_move_copy(void)
+{
+	struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
+	int ret;
+
+	sev_vm = sev_vm_create(/* es= */ false);
+	dst_vm = aux_vm_create(true);
+	mirror_vm = aux_vm_create(false);
+	dst_mirror_vm = aux_vm_create(false);
+
+	sev_mirror_create(mirror_vm->fd, sev_vm->fd);
+	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
+	TEST_ASSERT(ret == -1 && errno == EBUSY,
+		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
+		    errno);
+
+	/* The mirror itself can be migrated.  */
+	sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
+	ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
+	TEST_ASSERT(ret == -1 && errno == EBUSY,
+		    "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
+		    errno);
+
+	/*
+	 * mirror_vm is not a mirror anymore, dst_mirror_vm is.  Thus,
+	 * the owner can be copied as soon as dst_mirror_vm is gone.
+	 */
+	kvm_vm_free(dst_mirror_vm);
+	sev_migrate_from(dst_vm->fd, sev_vm->fd);
+
+	kvm_vm_free(mirror_vm);
+	kvm_vm_free(dst_vm);
+	kvm_vm_free(sev_vm);
+}
+
 int main(int argc, char *argv[])
 {
 	if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
@@ -301,6 +336,8 @@ int main(int argc, char *argv[])
 		test_sev_migrate_from(/* es= */ true);
 		test_sev_migrate_locking();
 		test_sev_migrate_parameters();
+		if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM))
+			test_sev_move_copy();
 	}
 	if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
 		test_sev_mirror(/* es= */ false);
-- 
cgit v1.2.3


From 5944b5abd8646e8c6ac6af2b55f87dede1dae898 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 30 Nov 2021 12:29:47 +0800
Subject: Bonding: add arp_missed_max option

Currently, we use hard code number to verify if we are in the
arp_interval timeslice. But some user may want to reduce/extend
the verify timeslice. With the similar team option 'missed_max'
the uers could change that number based on their own environment.

Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.rst | 11 +++++++++++
 drivers/net/bonding/bond_main.c      | 17 +++++++++--------
 drivers/net/bonding/bond_netlink.c   | 15 +++++++++++++++
 drivers/net/bonding/bond_options.c   | 28 ++++++++++++++++++++++++++++
 drivers/net/bonding/bond_procfs.c    |  2 ++
 drivers/net/bonding/bond_sysfs.c     | 13 +++++++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  1 +
 include/uapi/linux/if_link.h         |  1 +
 tools/include/uapi/linux/if_link.h   |  1 +
 10 files changed, 82 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index 31cfd7d674a6..e5a07cb48f68 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -421,6 +421,17 @@ arp_all_targets
 		consider the slave up only when all of the arp_ip_targets
 		are reachable
 
+arp_missed_max
+
+	Specifies the number of arp_interval monitor checks that must
+	fail in order for an interface to be marked down by the ARP monitor.
+
+	In order to provide orderly failover semantics, backup interfaces
+	are permitted an extra monitor check (i.e., they must fail
+	arp_missed_max + 1 times before being marked down).
+
+	The default value is 2, and the allowable range is 1 - 255.
+
 downdelay
 
 	Specifies the time, in milliseconds, to wait before disabling
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index cf73eacdda91..f3aa49b97f81 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3129,8 +3129,8 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
 			 * when the source ip is 0, so don't take the link down
 			 * if we don't know our ip yet
 			 */
-			if (!bond_time_in_interval(bond, trans_start, 2) ||
-			    !bond_time_in_interval(bond, slave->last_rx, 2)) {
+			if (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) ||
+			    !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) {
 
 				bond_propose_link_state(slave, BOND_LINK_DOWN);
 				slave_state_changed = 1;
@@ -3224,7 +3224,7 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 
 		/* Backup slave is down if:
 		 * - No current_arp_slave AND
-		 * - more than 3*delta since last receive AND
+		 * - more than (missed_max+1)*delta since last receive AND
 		 * - the bond has an IP address
 		 *
 		 * Note: a non-null current_arp_slave indicates
@@ -3236,20 +3236,20 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 		 */
 		if (!bond_is_active_slave(slave) &&
 		    !rcu_access_pointer(bond->current_arp_slave) &&
-		    !bond_time_in_interval(bond, last_rx, 3)) {
+		    !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) {
 			bond_propose_link_state(slave, BOND_LINK_DOWN);
 			commit++;
 		}
 
 		/* Active slave is down if:
-		 * - more than 2*delta since transmitting OR
-		 * - (more than 2*delta since receive AND
+		 * - more than missed_max*delta since transmitting OR
+		 * - (more than missed_max*delta since receive AND
 		 *    the bond has an IP address)
 		 */
 		trans_start = dev_trans_start(slave->dev);
 		if (bond_is_active_slave(slave) &&
-		    (!bond_time_in_interval(bond, trans_start, 2) ||
-		     !bond_time_in_interval(bond, last_rx, 2))) {
+		    (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) ||
+		     !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) {
 			bond_propose_link_state(slave, BOND_LINK_DOWN);
 			commit++;
 		}
@@ -5822,6 +5822,7 @@ static int bond_check_params(struct bond_params *params)
 	params->arp_interval = arp_interval;
 	params->arp_validate = arp_validate_value;
 	params->arp_all_targets = arp_all_targets_value;
+	params->missed_max = 2;
 	params->updelay = updelay;
 	params->downdelay = downdelay;
 	params->peer_notif_delay = 0;
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 5d54e11d18fa..1007bf6d385d 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -110,6 +110,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 					    .len  = ETH_ALEN },
 	[IFLA_BOND_TLB_DYNAMIC_LB]	= { .type = NLA_U8 },
 	[IFLA_BOND_PEER_NOTIF_DELAY]    = { .type = NLA_U32 },
+	[IFLA_BOND_MISSED_MAX]		= { .type = NLA_U8 },
 };
 
 static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = {
@@ -453,6 +454,15 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			return err;
 	}
 
+	if (data[IFLA_BOND_MISSED_MAX]) {
+		int missed_max = nla_get_u8(data[IFLA_BOND_MISSED_MAX]);
+
+		bond_opt_initval(&newval, missed_max);
+		err = __bond_opt_set(bond, BOND_OPT_MISSED_MAX, &newval);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -515,6 +525,7 @@ static size_t bond_get_size(const struct net_device *bond_dev)
 		nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */
 		nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */
 		nla_total_size(sizeof(u32)) +	/* IFLA_BOND_PEER_NOTIF_DELAY */
+		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_MISSED_MAX */
 		0;
 }
 
@@ -650,6 +661,10 @@ static int bond_fill_info(struct sk_buff *skb,
 		       bond->params.tlb_dynamic_lb))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_BOND_MISSED_MAX,
+		       bond->params.missed_max))
+		goto nla_put_failure;
+
 	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
 		struct ad_info info;
 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index a8fde3bc458f..0f48921c4f15 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -78,6 +78,8 @@ static int bond_option_ad_actor_system_set(struct bonding *bond,
 					   const struct bond_opt_value *newval);
 static int bond_option_ad_user_port_key_set(struct bonding *bond,
 					    const struct bond_opt_value *newval);
+static int bond_option_missed_max_set(struct bonding *bond,
+				      const struct bond_opt_value *newval);
 
 
 static const struct bond_opt_value bond_mode_tbl[] = {
@@ -213,6 +215,13 @@ static const struct bond_opt_value bond_ad_user_port_key_tbl[] = {
 	{ NULL,      -1,    0},
 };
 
+static const struct bond_opt_value bond_missed_max_tbl[] = {
+	{ "minval",	1,	BOND_VALFLAG_MIN},
+	{ "maxval",	255,	BOND_VALFLAG_MAX},
+	{ "default",	2,	BOND_VALFLAG_DEFAULT},
+	{ NULL,		-1,	0},
+};
+
 static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 	[BOND_OPT_MODE] = {
 		.id = BOND_OPT_MODE,
@@ -270,6 +279,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.values = bond_intmax_tbl,
 		.set = bond_option_arp_interval_set
 	},
+	[BOND_OPT_MISSED_MAX] = {
+		.id = BOND_OPT_MISSED_MAX,
+		.name = "arp_missed_max",
+		.desc = "Maximum number of missed ARP interval",
+		.unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) |
+			       BIT(BOND_MODE_ALB),
+		.values = bond_missed_max_tbl,
+		.set = bond_option_missed_max_set
+	},
 	[BOND_OPT_ARP_TARGETS] = {
 		.id = BOND_OPT_ARP_TARGETS,
 		.name = "arp_ip_target",
@@ -1186,6 +1204,16 @@ static int bond_option_arp_all_targets_set(struct bonding *bond,
 	return 0;
 }
 
+static int bond_option_missed_max_set(struct bonding *bond,
+				      const struct bond_opt_value *newval)
+{
+	netdev_dbg(bond->dev, "Setting missed max to %s (%llu)\n",
+		   newval->string, newval->value);
+	bond->params.missed_max = newval->value;
+
+	return 0;
+}
+
 static int bond_option_primary_set(struct bonding *bond,
 				   const struct bond_opt_value *newval)
 {
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index f3e3bfd72556..2ec11af5f0cc 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -115,6 +115,8 @@ static void bond_info_show_master(struct seq_file *seq)
 
 		seq_printf(seq, "ARP Polling Interval (ms): %d\n",
 				bond->params.arp_interval);
+		seq_printf(seq, "ARP Missed Max: %u\n",
+				bond->params.missed_max);
 
 		seq_printf(seq, "ARP IP target/s (n.n.n.n form):");
 
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index c48b77167fab..9b5a5df23d21 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -303,6 +303,18 @@ static ssize_t bonding_show_arp_targets(struct device *d,
 static DEVICE_ATTR(arp_ip_target, 0644,
 		   bonding_show_arp_targets, bonding_sysfs_store_option);
 
+/* Show the arp missed max. */
+static ssize_t bonding_show_missed_max(struct device *d,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	return sprintf(buf, "%u\n", bond->params.missed_max);
+}
+static DEVICE_ATTR(arp_missed_max, 0644,
+		   bonding_show_missed_max, bonding_sysfs_store_option);
+
 /* Show the up and down delays. */
 static ssize_t bonding_show_downdelay(struct device *d,
 				      struct device_attribute *attr,
@@ -779,6 +791,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_ad_actor_sys_prio.attr,
 	&dev_attr_ad_actor_system.attr,
 	&dev_attr_ad_user_port_key.attr,
+	&dev_attr_arp_missed_max.attr,
 	NULL,
 };
 
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index e64833a674eb..dd75c071f67e 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -65,6 +65,7 @@ enum {
 	BOND_OPT_NUM_PEER_NOTIF_ALIAS,
 	BOND_OPT_PEER_NOTIF_DELAY,
 	BOND_OPT_LACP_ACTIVE,
+	BOND_OPT_MISSED_MAX,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 15e083e18f75..f6ae3a4baea4 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -121,6 +121,7 @@ struct bond_params {
 	int xmit_policy;
 	int miimon;
 	u8 num_peer_notif;
+	u8 missed_max;
 	int arp_interval;
 	int arp_validate;
 	int arp_all_targets;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index eebd3894fe89..4ac53b30b6dc 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -858,6 +858,7 @@ enum {
 	IFLA_BOND_TLB_DYNAMIC_LB,
 	IFLA_BOND_PEER_NOTIF_DELAY,
 	IFLA_BOND_AD_LACP_ACTIVE,
+	IFLA_BOND_MISSED_MAX,
 	__IFLA_BOND_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index eebd3894fe89..4ac53b30b6dc 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -858,6 +858,7 @@ enum {
 	IFLA_BOND_TLB_DYNAMIC_LB,
 	IFLA_BOND_PEER_NOTIF_DELAY,
 	IFLA_BOND_AD_LACP_ACTIVE,
+	IFLA_BOND_MISSED_MAX,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From c291d0a4d169811898d723cfa5f1aa1fc60e607c Mon Sep 17 00:00:00 2001
From: Mehrdad Arshad Rad <arshad.rad@gmail.com>
Date: Sun, 28 Nov 2021 11:33:37 -0800
Subject: libbpf: Remove duplicate assignments

There is a same action when load_attr.attach_btf_id is initialized.

Signed-off-by: Mehrdad Arshad Rad <arshad.rad@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211128193337.10628-1-arshad.rad@gmail.com
---
 tools/lib/bpf/libbpf.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b59fede08ba7..5a2f5a6ae2f9 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6559,7 +6559,6 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 	load_attr.expected_attach_type = prog->expected_attach_type;
 	if (kernel_supports(obj, FEAT_PROG_NAME))
 		prog_name = prog->name;
-	load_attr.attach_btf_id = prog->attach_btf_id;
 	load_attr.attach_prog_fd = prog->attach_prog_fd;
 	load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd;
 	load_attr.attach_btf_id = prog->attach_btf_id;
-- 
cgit v1.2.3


From d6e6a27d960f9f07aef0b979c49c6736ede28f75 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 30 Nov 2021 14:13:16 +0000
Subject: tools: Fix math.h breakage

Commit 98e1385ef24b ("include/linux/radix-tree.h: replace kernel.h with
the necessary inclusions") broke the radix tree test suite in two
different ways; first by including math.h which didn't exist in the
tools directory, and second by removing an implicit include of
spinlock.h before lockdep.h.  Fix both issues.

Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/include/linux/kernel.h             | 22 +---------------------
 tools/include/linux/math.h               | 25 +++++++++++++++++++++++++
 tools/testing/radix-tree/linux/lockdep.h |  3 +++
 3 files changed, 29 insertions(+), 21 deletions(-)
 create mode 100644 tools/include/linux/math.h

(limited to 'tools')

diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index a7e54a08fb54..3e8df500cfbd 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -7,6 +7,7 @@
 #include <assert.h>
 #include <linux/build_bug.h>
 #include <linux/compiler.h>
+#include <linux/math.h>
 #include <endian.h>
 #include <byteswap.h>
 
@@ -14,8 +15,6 @@
 #define UINT_MAX	(~0U)
 #endif
 
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-
 #define PERF_ALIGN(x, a)	__PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __PERF_ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
@@ -52,15 +51,6 @@
 	_min1 < _min2 ? _min1 : _min2; })
 #endif
 
-#ifndef roundup
-#define roundup(x, y) (                                \
-{                                                      \
-	const typeof(y) __y = y;		       \
-	(((x) + (__y - 1)) / __y) * __y;	       \
-}                                                      \
-)
-#endif
-
 #ifndef BUG_ON
 #ifdef NDEBUG
 #define BUG_ON(cond) do { if (cond) {} } while (0)
@@ -104,16 +94,6 @@ int scnprintf_pad(char * buf, size_t size, const char * fmt, ...);
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-#define round_down(x, y) ((x) & ~__round_mask(x, y))
-
 #define current_gfp_context(k) 0
 #define synchronize_rcu()
 
diff --git a/tools/include/linux/math.h b/tools/include/linux/math.h
new file mode 100644
index 000000000000..4e7af99ec9eb
--- /dev/null
+++ b/tools/include/linux/math.h
@@ -0,0 +1,25 @@
+#ifndef _TOOLS_MATH_H
+#define _TOOLS_MATH_H
+
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#ifndef roundup
+#define roundup(x, y) (                                \
+{                                                      \
+	const typeof(y) __y = y;		       \
+	(((x) + (__y - 1)) / __y) * __y;	       \
+}                                                      \
+)
+#endif
+
+#endif
diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h
index 565fccdfe6e9..016cff473cfc 100644
--- a/tools/testing/radix-tree/linux/lockdep.h
+++ b/tools/testing/radix-tree/linux/lockdep.h
@@ -1,5 +1,8 @@
 #ifndef _LINUX_LOCKDEP_H
 #define _LINUX_LOCKDEP_H
+
+#include <linux/spinlock.h>
+
 struct lock_class_key {
 	unsigned int a;
 };
-- 
cgit v1.2.3


From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Mon, 29 Nov 2021 19:06:19 -0800
Subject: bpf: Add bpf_loop helper

This patch adds the kernel-side and API changes for a new helper
function, bpf_loop:

long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
u64 flags);

where long (*callback_fn)(u32 index, void *ctx);

bpf_loop invokes the "callback_fn" **nr_loops** times or until the
callback_fn returns 1. The callback_fn can only return 0 or 1, and
this is enforced by the verifier. The callback_fn index is zero-indexed.

A few things to please note:
~ The "u64 flags" parameter is currently unused but is included in
case a future use case for it arises.
~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c),
bpf_callback_t is used as the callback function cast.
~ A program can have nested bpf_loop calls but the program must
still adhere to the verifier constraint of its stack depth (the stack depth
cannot exceed MAX_BPF_STACK))
~ Recursive callback_fns do not pass the verifier, due to the call stack
for these being too deep.
~ The next patch will include the tests and benchmark

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 25 ++++++++++++
 kernel/bpf/bpf_iter.c          | 35 +++++++++++++++++
 kernel/bpf/helpers.c           |  2 +
 kernel/bpf/verifier.c          | 88 ++++++++++++++++++++++++++----------------
 tools/include/uapi/linux/bpf.h | 25 ++++++++++++
 6 files changed, 142 insertions(+), 34 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cc7a0c36e7df..cad0829710be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
+extern const struct bpf_func_proto bpf_loop_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a69e4b04ffeb..211b43afd0fb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4957,6 +4957,30 @@ union bpf_attr {
  *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
  *		**-EBUSY** if failed to try lock mmap_lock.
  *		**-EINVAL** for invalid **flags**.
+ *
+ * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For **nr_loops**, call **callback_fn** function
+ *		with **callback_ctx** as the context parameter.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0. Currently, nr_loops is
+ *		limited to 1 << 23 (~8 million) loops.
+ *
+ *		long (\*callback_fn)(u32 index, void \*ctx);
+ *
+ *		where **index** is the current index in the loop. The index
+ *		is zero-indexed.
+ *
+ *		If **callback_fn** returns 0, the helper will continue to the next
+ *		loop. If return value is 1, the helper will skip the rest of
+ *		the loops and return. Other return values are not used now,
+ *		and will be rejected by the verifier.
+ *
+ *	Return
+ *		The number of loops performed, **-EINVAL** for invalid **flags**,
+ *		**-E2BIG** if **nr_loops** exceeds the maximum number of loops.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5140,6 +5164,7 @@ union bpf_attr {
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
 	FN(find_vma),			\
+	FN(loop),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b2ee45064e06..b7aef5b3416d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+/* maximum number of loops */
+#define MAX_LOOPS	BIT(23)
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+	   u64, flags)
+{
+	bpf_callback_t callback = (bpf_callback_t)callback_fn;
+	u64 ret;
+	u32 i;
+
+	if (flags)
+		return -EINVAL;
+	if (nr_loops > MAX_LOOPS)
+		return -E2BIG;
+
+	for (i = 0; i < nr_loops; i++) {
+		ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+		/* return value: 0 - continue, 1 - stop and return */
+		if (ret)
+			return i + 1;
+	}
+
+	return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+	.func		= bpf_loop,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ffd469c217f..52188004a9c3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1378,6 +1378,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
+	case BPF_FUNC_loop:
+		return &bpf_loop_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0763cca139a7..d7678d8a925c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6085,6 +6085,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+				   struct bpf_func_state *caller,
+				   struct bpf_func_state *callee,
+				   int insn_idx)
+{
+	/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+	 *	    u64 flags);
+	 * callback_fn(u32 index, void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int set_timer_callback_state(struct bpf_verifier_env *env,
 				    struct bpf_func_state *caller,
 				    struct bpf_func_state *callee,
@@ -6458,13 +6479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return err;
 	}
 
-	if (func_id == BPF_FUNC_tail_call) {
-		err = check_reference_leak(env);
-		if (err) {
-			verbose(env, "tail_call would lead to reference leak\n");
-			return err;
-		}
-	} else if (is_release_function(func_id)) {
+	if (is_release_function(func_id)) {
 		err = release_reference(env, meta.ref_obj_id);
 		if (err) {
 			verbose(env, "func %s#%d reference has not been acquired before\n",
@@ -6475,42 +6490,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	regs = cur_regs(env);
 
-	/* check that flags argument in get_local_storage(map, flags) is 0,
-	 * this is required because get_local_storage() can't return an error.
-	 */
-	if (func_id == BPF_FUNC_get_local_storage &&
-	    !register_is_null(&regs[BPF_REG_2])) {
-		verbose(env, "get_local_storage() doesn't support non-zero flags\n");
-		return -EINVAL;
-	}
-
-	if (func_id == BPF_FUNC_for_each_map_elem) {
+	switch (func_id) {
+	case BPF_FUNC_tail_call:
+		err = check_reference_leak(env);
+		if (err) {
+			verbose(env, "tail_call would lead to reference leak\n");
+			return err;
+		}
+		break;
+	case BPF_FUNC_get_local_storage:
+		/* check that flags argument in get_local_storage(map, flags) is 0,
+		 * this is required because get_local_storage() can't return an error.
+		 */
+		if (!register_is_null(&regs[BPF_REG_2])) {
+			verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+			return -EINVAL;
+		}
+		break;
+	case BPF_FUNC_for_each_map_elem:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_map_elem_callback_state);
-		if (err < 0)
-			return -EINVAL;
-	}
-
-	if (func_id == BPF_FUNC_timer_set_callback) {
+		break;
+	case BPF_FUNC_timer_set_callback:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_timer_callback_state);
-		if (err < 0)
-			return -EINVAL;
-	}
-
-	if (func_id == BPF_FUNC_find_vma) {
+		break;
+	case BPF_FUNC_find_vma:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_find_vma_callback_state);
-		if (err < 0)
-			return -EINVAL;
-	}
-
-	if (func_id == BPF_FUNC_snprintf) {
+		break;
+	case BPF_FUNC_snprintf:
 		err = check_bpf_snprintf_call(env, regs);
-		if (err < 0)
-			return err;
+		break;
+	case BPF_FUNC_loop:
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_loop_callback_state);
+		break;
 	}
 
+	if (err)
+		return err;
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a69e4b04ffeb..211b43afd0fb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4957,6 +4957,30 @@ union bpf_attr {
  *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
  *		**-EBUSY** if failed to try lock mmap_lock.
  *		**-EINVAL** for invalid **flags**.
+ *
+ * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For **nr_loops**, call **callback_fn** function
+ *		with **callback_ctx** as the context parameter.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0. Currently, nr_loops is
+ *		limited to 1 << 23 (~8 million) loops.
+ *
+ *		long (\*callback_fn)(u32 index, void \*ctx);
+ *
+ *		where **index** is the current index in the loop. The index
+ *		is zero-indexed.
+ *
+ *		If **callback_fn** returns 0, the helper will continue to the next
+ *		loop. If return value is 1, the helper will skip the rest of
+ *		the loops and return. Other return values are not used now,
+ *		and will be rejected by the verifier.
+ *
+ *	Return
+ *		The number of loops performed, **-EINVAL** for invalid **flags**,
+ *		**-E2BIG** if **nr_loops** exceeds the maximum number of loops.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5140,6 +5164,7 @@ union bpf_attr {
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
 	FN(find_vma),			\
+	FN(loop),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 4e5070b64b375a9c1f570893cfceeba108382bef Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Mon, 29 Nov 2021 19:06:20 -0800
Subject: selftests/bpf: Add bpf_loop test

Add test for bpf_loop testing a variety of cases:
various nr_loops, null callback ctx, invalid flags, nested callbacks.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-3-joannekoong@fb.com
---
 tools/testing/selftests/bpf/prog_tests/bpf_loop.c | 145 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_loop.c      | 112 +++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_loop.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_loop.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c
new file mode 100644
index 000000000000..380d7a2072e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "bpf_loop.skel.h"
+
+static void check_nr_loops(struct bpf_loop *skel)
+{
+	struct bpf_link *link;
+
+	link = bpf_program__attach(skel->progs.test_prog);
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	/* test 0 loops */
+	skel->bss->nr_loops = 0;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops,
+		  "0 loops");
+
+	/* test 500 loops */
+	skel->bss->nr_loops = 500;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops,
+		  "500 loops");
+	ASSERT_EQ(skel->bss->g_output, (500 * 499) / 2, "g_output");
+
+	/* test exceeding the max limit */
+	skel->bss->nr_loops = -1;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->err, -E2BIG, "over max limit");
+
+	bpf_link__destroy(link);
+}
+
+static void check_callback_fn_stop(struct bpf_loop *skel)
+{
+	struct bpf_link *link;
+
+	link = bpf_program__attach(skel->progs.test_prog);
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	/* testing that loop is stopped when callback_fn returns 1 */
+	skel->bss->nr_loops = 400;
+	skel->data->stop_index = 50;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->nr_loops_returned, skel->data->stop_index + 1,
+		  "nr_loops_returned");
+	ASSERT_EQ(skel->bss->g_output, (50 * 49) / 2,
+		  "g_output");
+
+	bpf_link__destroy(link);
+}
+
+static void check_null_callback_ctx(struct bpf_loop *skel)
+{
+	struct bpf_link *link;
+
+	/* check that user is able to pass in a null callback_ctx */
+	link = bpf_program__attach(skel->progs.prog_null_ctx);
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	skel->bss->nr_loops = 10;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops,
+		  "nr_loops_returned");
+
+	bpf_link__destroy(link);
+}
+
+static void check_invalid_flags(struct bpf_loop *skel)
+{
+	struct bpf_link *link;
+
+	/* check that passing in non-zero flags returns -EINVAL */
+	link = bpf_program__attach(skel->progs.prog_invalid_flags);
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->err, -EINVAL, "err");
+
+	bpf_link__destroy(link);
+}
+
+static void check_nested_calls(struct bpf_loop *skel)
+{
+	__u32 nr_loops = 100, nested_callback_nr_loops = 4;
+	struct bpf_link *link;
+
+	/* check that nested calls are supported */
+	link = bpf_program__attach(skel->progs.prog_nested_calls);
+	if (!ASSERT_OK_PTR(link, "link"))
+		return;
+
+	skel->bss->nr_loops = nr_loops;
+	skel->bss->nested_callback_nr_loops = nested_callback_nr_loops;
+
+	usleep(1);
+
+	ASSERT_EQ(skel->bss->nr_loops_returned, nr_loops * nested_callback_nr_loops
+		  * nested_callback_nr_loops, "nr_loops_returned");
+	ASSERT_EQ(skel->bss->g_output, (4 * 3) / 2 * nested_callback_nr_loops
+		* nr_loops, "g_output");
+
+	bpf_link__destroy(link);
+}
+
+void test_bpf_loop(void)
+{
+	struct bpf_loop *skel;
+
+	skel = bpf_loop__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "bpf_loop__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	if (test__start_subtest("check_nr_loops"))
+		check_nr_loops(skel);
+	if (test__start_subtest("check_callback_fn_stop"))
+		check_callback_fn_stop(skel);
+	if (test__start_subtest("check_null_callback_ctx"))
+		check_null_callback_ctx(skel);
+	if (test__start_subtest("check_invalid_flags"))
+		check_invalid_flags(skel);
+	if (test__start_subtest("check_nested_calls"))
+		check_nested_calls(skel);
+
+	bpf_loop__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_loop.c b/tools/testing/selftests/bpf/progs/bpf_loop.c
new file mode 100644
index 000000000000..12349e4601e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_loop.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct callback_ctx {
+	int output;
+};
+
+/* These should be set by the user program */
+u32 nested_callback_nr_loops;
+u32 stop_index = -1;
+u32 nr_loops;
+int pid;
+
+/* Making these global variables so that the userspace program
+ * can verify the output through the skeleton
+ */
+int nr_loops_returned;
+int g_output;
+int err;
+
+static int callback(__u32 index, void *data)
+{
+	struct callback_ctx *ctx = data;
+
+	if (index >= stop_index)
+		return 1;
+
+	ctx->output += index;
+
+	return 0;
+}
+
+static int empty_callback(__u32 index, void *data)
+{
+	return 0;
+}
+
+static int nested_callback2(__u32 index, void *data)
+{
+	nr_loops_returned += bpf_loop(nested_callback_nr_loops, callback, data, 0);
+
+	return 0;
+}
+
+static int nested_callback1(__u32 index, void *data)
+{
+	bpf_loop(nested_callback_nr_loops, nested_callback2, data, 0);
+	return 0;
+}
+
+SEC("fentry/__x64_sys_nanosleep")
+int test_prog(void *ctx)
+{
+	struct callback_ctx data = {};
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	nr_loops_returned = bpf_loop(nr_loops, callback, &data, 0);
+
+	if (nr_loops_returned < 0)
+		err = nr_loops_returned;
+	else
+		g_output = data.output;
+
+	return 0;
+}
+
+SEC("fentry/__x64_sys_nanosleep")
+int prog_null_ctx(void *ctx)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	nr_loops_returned = bpf_loop(nr_loops, empty_callback, NULL, 0);
+
+	return 0;
+}
+
+SEC("fentry/__x64_sys_nanosleep")
+int prog_invalid_flags(void *ctx)
+{
+	struct callback_ctx data = {};
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	err = bpf_loop(nr_loops, callback, &data, 1);
+
+	return 0;
+}
+
+SEC("fentry/__x64_sys_nanosleep")
+int prog_nested_calls(void *ctx)
+{
+	struct callback_ctx data = {};
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	nr_loops_returned = 0;
+	bpf_loop(nr_loops, nested_callback1, &data, 0);
+
+	g_output = data.output;
+
+	return 0;
+}
-- 
cgit v1.2.3


From f6e659b7f97c76d0471d12bf274ea2a097cf3c5c Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Mon, 29 Nov 2021 19:06:21 -0800
Subject: selftests/bpf: Measure bpf_loop verifier performance

This patch tests bpf_loop in pyperf and strobemeta, and measures the
verifier performance of replacing the traditional for loop
with bpf_loop.

The results are as follows:

~strobemeta~

Baseline
    verification time 6808200 usec
    stack depth 496
    processed 554252 insns (limit 1000000) max_states_per_insn 16
    total_states 15878 peak_states 13489  mark_read 3110
    #192 verif_scale_strobemeta:OK (unrolled loop)

Using bpf_loop
    verification time 31589 usec
    stack depth 96+400
    processed 1513 insns (limit 1000000) max_states_per_insn 2
    total_states 106 peak_states 106 mark_read 60
    #193 verif_scale_strobemeta_bpf_loop:OK

~pyperf600~

Baseline
    verification time 29702486 usec
    stack depth 368
    processed 626838 insns (limit 1000000) max_states_per_insn 7
    total_states 30368 peak_states 30279 mark_read 748
    #182 verif_scale_pyperf600:OK (unrolled loop)

Using bpf_loop
    verification time 148488 usec
    stack depth 320+40
    processed 10518 insns (limit 1000000) max_states_per_insn 10
    total_states 705 peak_states 517 mark_read 38
    #183 verif_scale_pyperf600_bpf_loop:OK

Using the bpf_loop helper led to approximately a 99% decrease
in the verification time and in the number of instructions.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-4-joannekoong@fb.com
---
 .../selftests/bpf/prog_tests/bpf_verif_scale.c     | 12 ++++
 tools/testing/selftests/bpf/progs/pyperf.h         | 71 +++++++++++++++++++-
 .../selftests/bpf/progs/pyperf600_bpf_loop.c       |  6 ++
 tools/testing/selftests/bpf/progs/strobemeta.h     | 75 +++++++++++++++++++++-
 .../selftests/bpf/progs/strobemeta_bpf_loop.c      |  9 +++
 5 files changed, 169 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c
 create mode 100644 tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 27f5d8ea7964..1fb16f8dad56 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -115,6 +115,12 @@ void test_verif_scale_pyperf600()
 	scale_test("pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
 }
 
+void test_verif_scale_pyperf600_bpf_loop(void)
+{
+	/* use the bpf_loop helper*/
+	scale_test("pyperf600_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+}
+
 void test_verif_scale_pyperf600_nounroll()
 {
 	/* no unroll at all.
@@ -165,6 +171,12 @@ void test_verif_scale_strobemeta()
 	scale_test("strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
 }
 
+void test_verif_scale_strobemeta_bpf_loop(void)
+{
+	/* use the bpf_loop helper*/
+	scale_test("strobemeta_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+}
+
 void test_verif_scale_strobemeta_nounroll1()
 {
 	/* no unroll, tiny loops */
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h
index 2fb7adafb6b6..1ed28882daf3 100644
--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -159,6 +159,59 @@ struct {
 	__uint(value_size, sizeof(long long) * 127);
 } stackmap SEC(".maps");
 
+#ifdef USE_BPF_LOOP
+struct process_frame_ctx {
+	int cur_cpu;
+	int32_t *symbol_counter;
+	void *frame_ptr;
+	FrameData *frame;
+	PidData *pidData;
+	Symbol *sym;
+	Event *event;
+	bool done;
+};
+
+#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
+
+static int process_frame_callback(__u32 i, struct process_frame_ctx *ctx)
+{
+	int zero = 0;
+	void *frame_ptr = ctx->frame_ptr;
+	PidData *pidData = ctx->pidData;
+	FrameData *frame = ctx->frame;
+	int32_t *symbol_counter = ctx->symbol_counter;
+	int cur_cpu = ctx->cur_cpu;
+	Event *event = ctx->event;
+	Symbol *sym = ctx->sym;
+
+	if (frame_ptr && get_frame_data(frame_ptr, pidData, frame, sym)) {
+		int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu;
+		int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, sym);
+
+		if (!symbol_id) {
+			bpf_map_update_elem(&symbolmap, sym, &zero, 0);
+			symbol_id = bpf_map_lookup_elem(&symbolmap, sym);
+			if (!symbol_id) {
+				ctx->done = true;
+				return 1;
+			}
+		}
+		if (*symbol_id == new_symbol_id)
+			(*symbol_counter)++;
+
+		barrier_var(i);
+		if (i >= STACK_MAX_LEN)
+			return 1;
+
+		event->stack[i] = *symbol_id;
+
+		event->stack_len = i + 1;
+		frame_ptr = frame->f_back;
+	}
+	return 0;
+}
+#endif /* USE_BPF_LOOP */
+
 #ifdef GLOBAL_FUNC
 __noinline
 #elif defined(SUBPROGS)
@@ -228,11 +281,26 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 		int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym);
 		if (symbol_counter == NULL)
 			return 0;
+#ifdef USE_BPF_LOOP
+	struct process_frame_ctx ctx = {
+		.cur_cpu = cur_cpu,
+		.symbol_counter = symbol_counter,
+		.frame_ptr = frame_ptr,
+		.frame = &frame,
+		.pidData = pidData,
+		.sym = &sym,
+		.event = event,
+	};
+
+	bpf_loop(STACK_MAX_LEN, process_frame_callback, &ctx, 0);
+	if (ctx.done)
+		return 0;
+#else
 #ifdef NO_UNROLL
 #pragma clang loop unroll(disable)
 #else
 #pragma clang loop unroll(full)
-#endif
+#endif /* NO_UNROLL */
 		/* Unwind python stack */
 		for (int i = 0; i < STACK_MAX_LEN; ++i) {
 			if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) {
@@ -251,6 +319,7 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 				frame_ptr = frame.f_back;
 			}
 		}
+#endif /* USE_BPF_LOOP */
 		event->stack_complete = frame_ptr == NULL;
 	} else {
 		event->stack_complete = 1;
diff --git a/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c b/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c
new file mode 100644
index 000000000000..5c2059dc01af
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#define STACK_MAX_LEN 600
+#define USE_BPF_LOOP
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index 60c93aee2f4a..753718595c26 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -445,6 +445,48 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
 	return payload;
 }
 
+#ifdef USE_BPF_LOOP
+enum read_type {
+	READ_INT_VAR,
+	READ_MAP_VAR,
+	READ_STR_VAR,
+};
+
+struct read_var_ctx {
+	struct strobemeta_payload *data;
+	void *tls_base;
+	struct strobemeta_cfg *cfg;
+	void *payload;
+	/* value gets mutated */
+	struct strobe_value_generic *value;
+	enum read_type type;
+};
+
+static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
+{
+	switch (ctx->type) {
+	case READ_INT_VAR:
+		if (index >= STROBE_MAX_INTS)
+			return 1;
+		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
+		break;
+	case READ_MAP_VAR:
+		if (index >= STROBE_MAX_MAPS)
+			return 1;
+		ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
+					    ctx->value, ctx->data, ctx->payload);
+		break;
+	case READ_STR_VAR:
+		if (index >= STROBE_MAX_STRS)
+			return 1;
+		ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
+					     ctx->value, ctx->data, ctx->payload);
+		break;
+	}
+	return 0;
+}
+#endif /* USE_BPF_LOOP */
+
 /*
  * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
  * pointer to *right after* payload ends
@@ -475,11 +517,36 @@ static void *read_strobe_meta(struct task_struct *task,
 	 */
 	tls_base = (void *)task;
 
+#ifdef USE_BPF_LOOP
+	struct read_var_ctx ctx = {
+		.cfg = cfg,
+		.tls_base = tls_base,
+		.value = &value,
+		.data = data,
+		.payload = payload,
+	};
+	int err;
+
+	ctx.type = READ_INT_VAR;
+	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_INTS)
+		return NULL;
+
+	ctx.type = READ_STR_VAR;
+	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_STRS)
+		return NULL;
+
+	ctx.type = READ_MAP_VAR;
+	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
+	if (err != STROBE_MAX_MAPS)
+		return NULL;
+#else
 #ifdef NO_UNROLL
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
 		read_int_var(cfg, i, tls_base, &value, data);
 	}
@@ -487,7 +554,7 @@ static void *read_strobe_meta(struct task_struct *task,
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
 		payload += read_str_var(cfg, i, tls_base, &value, data, payload);
 	}
@@ -495,10 +562,12 @@ static void *read_strobe_meta(struct task_struct *task,
 #pragma clang loop unroll(disable)
 #else
 #pragma unroll
-#endif
+#endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
 		payload = read_map_var(cfg, i, tls_base, &value, data, payload);
 	}
+#endif /* USE_BPF_LOOP */
+
 	/*
 	 * return pointer right after end of payload, so it's possible to
 	 * calculate exact amount of useful data that needs to be sent
diff --git a/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c b/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c
new file mode 100644
index 000000000000..d18b992f0165
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2021 Facebook */
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 100
+#define STROBE_MAX_MAP_ENTRIES 20
+#define USE_BPF_LOOP
+#include "strobemeta.h"
-- 
cgit v1.2.3


From ec151037af4f56065d5b258af82f13dbbf279ebd Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannekoong@fb.com>
Date: Mon, 29 Nov 2021 19:06:22 -0800
Subject: selftest/bpf/benchs: Add bpf_loop benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add benchmark to measure the throughput and latency of the bpf_loop
call.

Testing this on my dev machine on 1 thread, the data is as follows:

        nr_loops: 10
bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op

        nr_loops: 100
bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op

        nr_loops: 500
bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op

        nr_loops: 1000
bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op

        nr_loops: 5000
bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op

        nr_loops: 10000
bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op

        nr_loops: 50000
bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op

        nr_loops: 100000
bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op

        nr_loops: 500000
bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op

        nr_loops: 1000000
bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op

>From this data, we can see that the latency per loop decreases as the
number of loops increases. On this particular machine, each loop had an
overhead of about ~4 ns, and we were able to run ~250 million loops
per second.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-5-joannekoong@fb.com
---
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/bench.c                |  37 ++++++++
 tools/testing/selftests/bpf/bench.h                |   2 +
 .../testing/selftests/bpf/benchs/bench_bpf_loop.c  | 105 +++++++++++++++++++++
 .../selftests/bpf/benchs/run_bench_bpf_loop.sh     |  15 +++
 tools/testing/selftests/bpf/benchs/run_common.sh   |  15 +++
 tools/testing/selftests/bpf/progs/bpf_loop_bench.c |  26 +++++
 7 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_loop_bench.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 35684d61aaeb..a6c0e92c86a1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -531,6 +531,7 @@ $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
 $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
 			    $(OUTPUT)/perfbuf_bench.skel.h
 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
+$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -540,7 +541,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_rename.o \
 		 $(OUTPUT)/bench_trigger.o \
 		 $(OUTPUT)/bench_ringbufs.o \
-		 $(OUTPUT)/bench_bloom_filter_map.o
+		 $(OUTPUT)/bench_bloom_filter_map.o \
+		 $(OUTPUT)/bench_bpf_loop.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index c75e7ee28746..3d6082b97a56 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -134,6 +134,39 @@ void hits_drops_report_final(struct bench_res res[], int res_cnt)
 	       total_ops_mean, total_ops_stddev);
 }
 
+void ops_report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+	double hits_per_sec, hits_per_prod;
+
+	hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0);
+	hits_per_prod = hits_per_sec / env.producer_cnt;
+
+	printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0);
+
+	printf("hits %8.3lfM/s (%7.3lfM/prod)\n", hits_per_sec, hits_per_prod);
+}
+
+void ops_report_final(struct bench_res res[], int res_cnt)
+{
+	double hits_mean = 0.0, hits_stddev = 0.0;
+	int i;
+
+	for (i = 0; i < res_cnt; i++)
+		hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt);
+
+	if (res_cnt > 1)  {
+		for (i = 0; i < res_cnt; i++)
+			hits_stddev += (hits_mean - res[i].hits / 1000000.0) *
+				       (hits_mean - res[i].hits / 1000000.0) /
+				       (res_cnt - 1.0);
+
+		hits_stddev = sqrt(hits_stddev);
+	}
+	printf("Summary: throughput %8.3lf \u00B1 %5.3lf M ops/s (%7.3lfM ops/prod), ",
+	       hits_mean, hits_stddev, hits_mean / env.producer_cnt);
+	printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt);
+}
+
 const char *argp_program_version = "benchmark";
 const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
 const char argp_program_doc[] =
@@ -171,10 +204,12 @@ static const struct argp_option opts[] = {
 
 extern struct argp bench_ringbufs_argp;
 extern struct argp bench_bloom_map_argp;
+extern struct argp bench_bpf_loop_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
 	{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
+	{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
 	{},
 };
 
@@ -373,6 +408,7 @@ extern const struct bench bench_bloom_update;
 extern const struct bench bench_bloom_false_positive;
 extern const struct bench bench_hashmap_without_bloom;
 extern const struct bench bench_hashmap_with_bloom;
+extern const struct bench bench_bpf_loop;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -404,6 +440,7 @@ static const struct bench *benchs[] = {
 	&bench_bloom_false_positive,
 	&bench_hashmap_without_bloom,
 	&bench_hashmap_with_bloom,
+	&bench_bpf_loop,
 };
 
 static void setup_benchmark()
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index 624c6b11501f..50785503756b 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -59,6 +59,8 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
 void hits_drops_report_final(struct bench_res res[], int res_cnt);
 void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
 void false_hits_report_final(struct bench_res res[], int res_cnt);
+void ops_report_progress(int iter, struct bench_res *res, long delta_ns);
+void ops_report_final(struct bench_res res[], int res_cnt);
 
 static inline __u64 get_time_ns() {
 	struct timespec t;
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
new file mode 100644
index 000000000000..d0a6572bfab6
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <argp.h>
+#include "bench.h"
+#include "bpf_loop_bench.skel.h"
+
+/* BPF triggering benchmarks */
+static struct ctx {
+	struct bpf_loop_bench *skel;
+} ctx;
+
+static struct {
+	__u32 nr_loops;
+} args = {
+	.nr_loops = 10,
+};
+
+enum {
+	ARG_NR_LOOPS = 4000,
+};
+
+static const struct argp_option opts[] = {
+	{ "nr_loops", ARG_NR_LOOPS, "nr_loops", 0,
+		"Set number of loops for the bpf_loop helper"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_NR_LOOPS:
+		args.nr_loops = strtol(arg, NULL, 10);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+/* exported into benchmark runner */
+const struct argp bench_bpf_loop_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+static void validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+}
+
+static void *producer(void *input)
+{
+	while (true)
+		/* trigger the bpf program */
+		syscall(__NR_getpgid);
+
+	return NULL;
+}
+
+static void *consumer(void *input)
+{
+	return NULL;
+}
+
+static void measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+static void setup(void)
+{
+	struct bpf_link *link;
+
+	setup_libbpf();
+
+	ctx.skel = bpf_loop_bench__open_and_load();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	link = bpf_program__attach(ctx.skel->progs.benchmark);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+
+	ctx.skel->bss->nr_loops = args.nr_loops;
+}
+
+const struct bench bench_bpf_loop = {
+	.name = "bpf-loop",
+	.validate = validate,
+	.setup = setup,
+	.producer_thread = producer,
+	.consumer_thread = consumer,
+	.measure = measure,
+	.report_progress = ops_report_progress,
+	.report_final = ops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh
new file mode 100755
index 000000000000..d4f5f73b356b
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+for t in 1 4 8 12 16; do
+for i in 10 100 500 1000 5000 10000 50000 100000 500000 1000000; do
+subtitle "nr_loops: $i, nr_threads: $t"
+	summarize_ops "bpf_loop: " \
+	    "$($RUN_BENCH -p $t --nr_loops $i bpf-loop)"
+	printf "\n"
+done
+done
diff --git a/tools/testing/selftests/bpf/benchs/run_common.sh b/tools/testing/selftests/bpf/benchs/run_common.sh
index 9a16be78b180..6c5e6023a69f 100644
--- a/tools/testing/selftests/bpf/benchs/run_common.sh
+++ b/tools/testing/selftests/bpf/benchs/run_common.sh
@@ -33,6 +33,14 @@ function percentage()
 	echo "$*" | sed -E "s/.*Percentage\s=\s+([0-9]+\.[0-9]+).*/\1/"
 }
 
+function ops()
+{
+	echo -n "throughput: "
+	echo -n "$*" | sed -E "s/.*throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/"
+	echo -n -e ", latency: "
+	echo "$*" | sed -E "s/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/"
+}
+
 function total()
 {
 	echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
@@ -52,6 +60,13 @@ function summarize_percentage()
 	printf "%-20s %s%%\n" "$bench" "$(percentage $summary)"
 }
 
+function summarize_ops()
+{
+	bench="$1"
+	summary=$(echo $2 | tail -n1)
+	printf "%-20s %s\n" "$bench" "$(ops $summary)"
+}
+
 function summarize_total()
 {
 	bench="$1"
diff --git a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c
new file mode 100644
index 000000000000..9dafdc244462
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+u32 nr_loops;
+long hits;
+
+static int empty_callback(__u32 index, void *data)
+{
+	return 0;
+}
+
+SEC("fentry/__x64_sys_getpgid")
+int benchmark(void *ctx)
+{
+	for (int i = 0; i < 1000; i++) {
+		bpf_loop(nr_loops, empty_callback, NULL, 0);
+
+		__sync_add_and_fetch(&hits, nr_loops);
+	}
+	return 0;
+}
-- 
cgit v1.2.3


From 0270090d396a8e7e7f42adae13fdfa48ffb85144 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 23 Nov 2021 05:27:32 +0530
Subject: libbpf: Avoid double stores for success/failure case of ksym
 relocations

Instead, jump directly to success case stores in case ret >= 0, else do
the default 0 value store and jump over the success case. This is better
in terms of readability. Readjust the code for kfunc relocation as well
to follow a similar pattern, also leads to easier to follow code now.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211122235733.634914-3-memxor@gmail.com
---
 tools/lib/bpf/gen_loader.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index c7bc77f4e752..5a2d6bf041dd 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -674,27 +674,29 @@ static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo
 		return;
 	}
 	kdesc->off = btf_fd_idx;
-	/* set a default value for imm */
+	/* jump to success case */
+	emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3));
+	/* set value for imm, off as 0 */
 	emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0));
-	/* skip success case store if ret < 0 */
-	emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 1));
+	emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0));
+	/* skip success case for ret < 0 */
+	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 10));
 	/* store btf_id into insn[insn_idx].imm */
 	emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm)));
+	/* obtain fd in BPF_REG_9 */
+	emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7));
+	emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32));
+	/* jump to fd_array store if fd denotes module BTF */
+	emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2));
+	/* set the default value for off */
+	emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0));
+	/* skip BTF fd store for vmlinux BTF */
+	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4));
 	/* load fd_array slot pointer */
 	emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE,
 					 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx)));
-	/* skip store of BTF fd if ret < 0 */
-	emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 3));
 	/* store BTF fd in slot */
-	emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7));
-	emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32));
 	emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0));
-	/* set a default value for off */
-	emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0));
-	/* skip insn->off store if ret < 0 */
-	emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 2));
-	/* skip if vmlinux BTF */
-	emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1));
 	/* store index into insn[insn_idx].off */
 	emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx));
 log:
@@ -803,17 +805,20 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo,
 	emit_bpf_find_by_name_kind(gen, relo);
 	if (!relo->is_weak)
 		emit_check_err(gen);
-	/* set default values as 0 */
+	/* jump to success case */
+	emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3));
+	/* set values for insn[insn_idx].imm, insn[insn_idx + 1].imm as 0 */
 	emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0));
 	emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 0));
-	/* skip success case stores if ret < 0 */
-	emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 4));
+	/* skip success case for ret < 0 */
+	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4));
 	/* store btf_id into insn[insn_idx].imm */
 	emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm)));
 	/* store btf_obj_fd into insn[insn_idx + 1].imm */
 	emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32));
 	emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7,
 			      sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)));
+	/* skip src_reg adjustment */
 	emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3));
 clear_src_reg:
 	/* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */
-- 
cgit v1.2.3


From d995816b77eb826e0f6d7adf4471ec191b362be0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 23 Nov 2021 05:27:33 +0530
Subject: libbpf: Avoid reload of imm for weak, unresolved, repeating ksym

Alexei pointed out that we can use BPF_REG_0 which already contains imm
from move_blob2blob computation. Note that we now compare the second
insn's imm, but this should not matter, since both will be zeroed out
for the error case for the insn populated earlier.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211122235733.634914-4-memxor@gmail.com
---
 tools/lib/bpf/gen_loader.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 5a2d6bf041dd..6f3790369463 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -795,9 +795,8 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo,
 			       kdesc->insn + offsetof(struct bpf_insn, imm));
 		move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4,
 			       kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm));
-		emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_8, offsetof(struct bpf_insn, imm)));
-		/* jump over src_reg adjustment if imm is not 0 */
-		emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 3));
+		/* jump over src_reg adjustment if imm is not 0, reuse BPF_REG_0 from move_blob2blob */
+		emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3));
 		goto clear_src_reg;
 	}
 	/* remember insn offset, so we can copy BTF ID and FD later */
-- 
cgit v1.2.3


From 8c0abfd6d2f6b0221194241ac2908751a2a0385f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Nov 2021 18:57:01 -0800
Subject: rcutorture: Add CONFIG_PREEMPT_DYNAMIC=n to tiny scenarios

With CONFIG_PREEMPT_DYNAMIC=y, the kernel builds with CONFIG_PREEMPTION=y
because preemption can be enabled at runtime.  This prevents any tests
of Tiny RCU or Tiny SRCU from running correctly.  This commit therefore
explicitly sets CONFIG_PREEMPT_DYNAMIC=n for those scenarios.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-T    | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-U    | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/TINY01    | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/TINY02    | 1 +
 tools/testing/selftests/rcutorture/configs/rcuscale/TINY | 1 +
 5 files changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T
index d6557c38dfe4..c70cf0405f24 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T
@@ -2,6 +2,7 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
 #CHECK#CONFIG_TINY_SRCU=y
 CONFIG_RCU_TRACE=n
 CONFIG_DEBUG_LOCK_ALLOC=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U
index 6bc24e99862f..bc9eeabaa1b1 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U
@@ -2,6 +2,7 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
 #CHECK#CONFIG_TINY_SRCU=y
 CONFIG_RCU_TRACE=n
 CONFIG_DEBUG_LOCK_ALLOC=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY01 b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
index 6db705e55487..0953c52fcfd7 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
@@ -2,6 +2,7 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
 #CHECK#CONFIG_TINY_RCU=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
index d8674264318d..30439f6fc20e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -2,6 +2,7 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
 #CHECK#CONFIG_TINY_RCU=y
 CONFIG_HZ_PERIODIC=y
 CONFIG_NO_HZ_IDLE=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
index fb05ef5279b4..64e9490c1f79 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
@@ -2,6 +2,7 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
 #CHECK#CONFIG_TINY_RCU=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
-- 
cgit v1.2.3


From f04cbe651b4e71c4477e81482491aadfd062d878 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 27 Sep 2021 14:13:24 -0700
Subject: torture: Remove RCU_FAST_NO_HZ from rcuscale and refscale scenarios

All of the rcuscale and refscale scenarios that mention the Kconfig option
CONFIG_RCU_FAST_NO_HZ disable it.  But this Kconfig option is disabled by
default, so this commit removes the pointless "CONFIG_RCU_FAST_NO_HZ=n"
lines from these scenarios.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcuscale/TINY      | 1 -
 tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01   | 1 -
 tools/testing/selftests/rcutorture/configs/rcuscale/TREE      | 1 -
 tools/testing/selftests/rcutorture/configs/rcuscale/TREE54    | 1 -
 tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT | 1 -
 tools/testing/selftests/rcutorture/configs/refscale/PREEMPT   | 1 -
 6 files changed, 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
index 64e9490c1f79..0fa2dc086e10 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
@@ -7,7 +7,6 @@ CONFIG_PREEMPT_DYNAMIC=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_PROVE_LOCKING=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
index e6baa2fbaeb3..227aba7783af 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
@@ -5,7 +5,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_PROVE_LOCKING=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
index 4cc1cc581321..f110d9ffbe4c 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
@@ -6,7 +6,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
index f5952061fde7..9f83e5372796 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
index ad505a887bec..7f06838a91e6 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
@@ -6,7 +6,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
index 4f08e641bb6b..52e3ef674056 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
@@ -6,7 +6,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
-- 
cgit v1.2.3


From 24eab6e1ff58d33048c36b5452f5d284c3710d02 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 27 Sep 2021 14:17:35 -0700
Subject: torture: Remove RCU_FAST_NO_HZ from rcu scenarios

All of the rcu scenarios that mentioning CONFIG_RCU_FAST_NO_HZ disable it.
But this Kconfig option is disabled by default, so this commit removes
the pointless "CONFIG_RCU_FAST_NO_HZ=n" lines from these scenarios.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/TREE02 | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TREE05 | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TREE06 | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TREE07 | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TREE08 | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 1 -
 6 files changed, 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
index 65daee4fbf5a..2871ee599891 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_FANOUT=3
 CONFIG_RCU_FANOUT_LEAF=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
index 4f95f8544f3f..9f48c73709ec 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_RCU_FANOUT=6
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
index bf4980d606b5..db27651de04b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_FANOUT=6
 CONFIG_RCU_FANOUT_LEAF=6
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index d7afb271a586..2789b47e4ecd 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_RCU_FANOUT=2
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
index c810c5276a89..8b561355b9ef 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_FANOUT=3
 CONFIG_RCU_FANOUT_LEAF=2
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
index 7311f84a5876..4a00539bfdd7 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
-- 
cgit v1.2.3


From e2c73a6860bdf54f2c6bf8cddc34ddc91a1343e1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 27 Sep 2021 14:18:51 -0700
Subject: rcu: Remove the RCU_FAST_NO_HZ Kconfig option

All of the uses of CONFIG_RCU_FAST_NO_HZ=y that I have seen involve
systems with RCU callbacks offloaded.  In this situation, all that this
Kconfig option does is slow down idle entry/exit with an additional
allways-taken early exit.  If this is the only use case, then this
Kconfig option nothing but an attractive nuisance that needs to go away.

This commit therefore removes the RCU_FAST_NO_HZ Kconfig option.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/stallwarn.rst                    |  11 --
 Documentation/admin-guide/kernel-parameters.txt    |   4 -
 Documentation/timers/no_hz.rst                     |  10 +-
 kernel/rcu/Kconfig                                 |  18 --
 kernel/rcu/tree.c                                  |  11 --
 kernel/rcu/tree.h                                  |   7 -
 kernel/rcu/tree_plugin.h                           | 185 +--------------------
 kernel/rcu/tree_stall.h                            |  27 +--
 .../selftests/rcutorture/configs/rcu/TREE01        |   1 -
 .../selftests/rcutorture/configs/rcu/TREE04        |   1 -
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |   1 -
 11 files changed, 7 insertions(+), 269 deletions(-)

(limited to 'tools')

diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 28f8ad16db25..78404625bad2 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -254,17 +254,6 @@ period (in this case 2603), the grace-period sequence number (7075), and
 an estimate of the total number of RCU callbacks queued across all CPUs
 (625 in this case).
 
-In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
-for each CPU::
-
-	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
-
-The "last_accelerate:" prints the low-order 16 bits (in hex) of the
-jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
-from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
-processing is enabled.
-
 If the grace period ends just as the stall warning starts printing,
 there will be a spurious stall-warning message, which will include
 the following::
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9725c546a0d4..36b83a47e54a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4489,10 +4489,6 @@
 			on rcutree.qhimark at boot time and to zero to
 			disable more aggressive help enlistment.
 
-	rcutree.rcu_idle_gp_delay= [KNL]
-			Set wakeup interval for idle CPUs that have
-			RCU callbacks (RCU_FAST_NO_HZ=y).
-
 	rcutree.rcu_kick_kthreads= [KNL]
 			Cause the grace-period kthread to get an extra
 			wake_up() if it sleeps three times longer than
diff --git a/Documentation/timers/no_hz.rst b/Documentation/timers/no_hz.rst
index 20ad23a6c618..f8786be15183 100644
--- a/Documentation/timers/no_hz.rst
+++ b/Documentation/timers/no_hz.rst
@@ -184,16 +184,12 @@ There are situations in which idle CPUs cannot be permitted to
 enter either dyntick-idle mode or adaptive-tick mode, the most
 common being when that CPU has RCU callbacks pending.
 
-The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs
-to enter dyntick-idle mode or adaptive-tick mode anyway.  In this case,
-a timer will awaken these CPUs every four jiffies in order to ensure
-that the RCU callbacks are processed in a timely fashion.
-
-Another approach is to offload RCU callback processing to "rcuo" kthreads
+Avoid this by offloading RCU callback processing to "rcuo" kthreads
 using the CONFIG_RCU_NOCB_CPU=y Kconfig option.  The specific CPUs to
 offload may be selected using The "rcu_nocbs=" kernel boot parameter,
 which takes a comma-separated list of CPUs and CPU ranges, for example,
-"1,3-5" selects CPUs 1, 3, 4, and 5.
+"1,3-5" selects CPUs 1, 3, 4, and 5.  Note that CPUs specified by
+the "nohz_full" kernel boot parameter are also offloaded.
 
 The offloaded CPUs will never queue RCU callbacks, and therefore RCU
 never prevents offloaded CPUs from entering either dyntick-idle mode
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128b7cf8e1f..4005f2728f1a 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF
 
 	  Take the default if unsure.
 
-config RCU_FAST_NO_HZ
-	bool "Accelerate last non-dyntick-idle CPU's grace periods"
-	depends on NO_HZ_COMMON && SMP && RCU_EXPERT
-	default n
-	help
-	  This option permits CPUs to enter dynticks-idle state even if
-	  they have RCU callbacks queued, and prevents RCU from waking
-	  these CPUs up more than roughly once every four jiffies (by
-	  default, you can adjust this using the rcutree.rcu_idle_gp_delay
-	  parameter), thus improving energy efficiency.  On the other
-	  hand, this option increases the duration of RCU grace periods,
-	  for example, slowing down synchronize_rcu().
-
-	  Say Y if energy efficiency is critically important, and you
-	  	don't care about increased grace-period durations.
-
-	  Say N if you are unsure.
-
 config RCU_BOOST
 	bool "Enable RCU priority boosting"
 	depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ef8d36f580fc..5e98257c2910 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user)
 	instrumentation_begin();
 	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
-	rcu_prepare_for_idle();
 	rcu_preempt_deferred_qs(current);
 
 	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
@@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void)
 	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
-	if (!in_nmi())
-		rcu_prepare_for_idle();
-
 	// instrumentation for the noinstr rcu_dynticks_eqs_enter()
 	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
 	instrumentation_end();
@@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user)
 	// instrumentation for the noinstr rcu_dynticks_eqs_exit()
 	instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
 
-	rcu_cleanup_after_idle();
 	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdp->dynticks_nesting, 1);
@@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void)
 		rcu_dynticks_eqs_exit();
 		// ... but is watching here.
 
-		if (!in_nmi()) {
-			instrumentation_begin();
-			rcu_cleanup_after_idle();
-			instrumentation_end();
-		}
-
 		instrumentation_begin();
 		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
 		instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..7ca1aa46083c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -189,11 +189,6 @@ struct rcu_data {
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
 	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
-	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */
-	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 
 	/* 4) rcu_barrier(), OOM callbacks, and expediting. */
 	struct rcu_head barrier_head;
@@ -419,8 +414,6 @@ static bool rcu_is_callbacks_kthread(void);
 static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
 static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
 static void rcu_preempt_deferred_qs(struct task_struct *t);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5199559fbbf0..19f7d578cedb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -51,8 +51,6 @@ static void __init rcu_bootup_announce_oddness(void)
 			RCU_FANOUT);
 	if (rcu_fanout_exact)
 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
-	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
-		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
 	if (IS_ENABLED(CONFIG_PROVE_RCU))
 		pr_info("\tRCU lockdep checking is enabled.\n");
 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
@@ -1253,16 +1251,14 @@ static void __init rcu_spawn_boost_kthreads(void)
 
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
 /*
  * Check to see if any future non-offloaded RCU-related work will need
  * to be done by the current CPU, even if none need be done immediately,
  * returning 1 if so.  This function is part of the RCU implementation;
  * it is -not- an exported member of the RCU API.
  *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
  */
 int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
@@ -1271,183 +1267,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 		!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
 }
 
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
- *	is sized to be roughly one RCU grace period.  Those energy-efficiency
- *	benchmarkers who might otherwise be tempted to set this to a large
- *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- *	system.  And if you are -that- concerned about energy efficiency,
- *	just power the system down and be done with it!
- *
- * The value below works well in practice.  If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so.  Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
-	bool cbs_ready = false;
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-	struct rcu_node *rnp;
-
-	/* Exit early if we advanced recently. */
-	if (jiffies == rdp->last_advance_all)
-		return false;
-	rdp->last_advance_all = jiffies;
-
-	rnp = rdp->mynode;
-
-	/*
-	 * Don't bother checking unless a grace period has
-	 * completed since we last checked and there are
-	 * callbacks not yet ready to invoke.
-	 */
-	if ((rcu_seq_completed_gp(rdp->gp_seq,
-				  rcu_seq_current(&rnp->gp_seq)) ||
-	     unlikely(READ_ONCE(rdp->gpwrap))) &&
-	    rcu_segcblist_pend_cbs(&rdp->cblist))
-		note_gp_changes(rdp);
-
-	if (rcu_segcblist_ready_cbs(&rdp->cblist))
-		cbs_ready = true;
-	return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-	unsigned long dj;
-
-	lockdep_assert_irqs_disabled();
-
-	/* If no non-offloaded callbacks, RCU doesn't need the CPU. */
-	if (rcu_segcblist_empty(&rdp->cblist) ||
-	    rcu_rdp_is_offloaded(rdp)) {
-		*nextevt = KTIME_MAX;
-		return 0;
-	}
-
-	/* Attempt to advance callbacks. */
-	if (rcu_try_advance_all_cbs()) {
-		/* Some ready to invoke, so initiate later invocation. */
-		invoke_rcu_core();
-		return 1;
-	}
-	rdp->last_accelerate = jiffies;
-
-	/* Request timer and round. */
-	dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
-	*nextevt = basemono + dj * TICK_NSEC;
-	return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective.  The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs.  The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
-	bool needwake;
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-	struct rcu_node *rnp;
-	int tne;
-
-	lockdep_assert_irqs_disabled();
-	if (rcu_rdp_is_offloaded(rdp))
-		return;
-
-	/* Handle nohz enablement switches conservatively. */
-	tne = READ_ONCE(tick_nohz_active);
-	if (tne != rdp->tick_nohz_enabled_snap) {
-		if (!rcu_segcblist_empty(&rdp->cblist))
-			invoke_rcu_core(); /* force nohz to see update. */
-		rdp->tick_nohz_enabled_snap = tne;
-		return;
-	}
-	if (!tne)
-		return;
-
-	/*
-	 * If we have not yet accelerated this jiffy, accelerate all
-	 * callbacks on this CPU.
-	 */
-	if (rdp->last_accelerate == jiffies)
-		return;
-	rdp->last_accelerate = jiffies;
-	if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
-		rnp = rdp->mynode;
-		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
-		needwake = rcu_accelerate_cbs(rnp, rdp);
-		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
-		if (needwake)
-			rcu_gp_kthread_wake();
-	}
-}
-
-/*
- * Clean up for exit from idle.  Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
-	lockdep_assert_irqs_disabled();
-	if (rcu_rdp_is_offloaded(rdp))
-		return;
-	if (rcu_try_advance_all_cbs())
-		invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
 /*
  * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
  * grace-period kthread will do force_quiescent_state() processing?
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5e2fa6fd97f1..21bebf7c9030 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void)
 	}
 }
 
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
-	sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
-		rdp->last_accelerate & 0xffff, jiffies & 0xffff,
-		!!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
-	*cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
 static const char * const gp_state_names[] = {
 	[RCU_GP_IDLE] = "RCU_GP_IDLE",
 	[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
  * of RCU grace periods that this CPU is ignorant of, for example, "1"
  * if the CPU was aware of the previous grace period.
  *
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
  */
 static void print_cpu_stall_info(int cpu)
 {
 	unsigned long delta;
 	bool falsepositive;
-	char fast_no_hz[72];
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	char *ticks_title;
 	unsigned long ticks_value;
@@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu)
 		ticks_title = "ticks this GP";
 		ticks_value = rdp->ticks_this_gp;
 	}
-	print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
 	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
 	falsepositive = rcu_is_gp_kthread_starving(NULL) &&
 			rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
-	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
 	       cpu,
 	       "O."[!!cpu_online(cpu)],
 	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu)
 	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
-	       fast_no_hz,
 	       falsepositive ? " (false positive?)" : "");
 }
 
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index b5b53973c01e..8ae41d5f81a3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -6,7 +6,6 @@ CONFIG_PREEMPT=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=y
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=y
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_MAXSMP=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index f6d6a40c0576..22ad0261728d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -7,7 +7,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
-CONFIG_RCU_FAST_NO_HZ=y
 CONFIG_RCU_TRACE=y
 CONFIG_RCU_FANOUT=4
 CONFIG_RCU_FANOUT_LEAF=3
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 1b96d68473b8..42acb1a64ce1 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -15,7 +15,6 @@ CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
 CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
 CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
 CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
-CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs.
 CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with
 	rcu_nocbs=0, and one with all rcu_nocbs CPUs.
 CONFIG_RCU_TRACE -- Do half.
-- 
cgit v1.2.3


From 937ed91c712273131de6d2a02caafd3ee84e0c72 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Date: Sun, 24 Oct 2021 19:28:14 +0200
Subject: tools/nolibc: x86-64: Fix startup code bug

Before this patch, the `_start` function looks like this:
```
0000000000001170 <_start>:
    1170:	pop    %rdi
    1171:	mov    %rsp,%rsi
    1174:	lea    0x8(%rsi,%rdi,8),%rdx
    1179:	and    $0xfffffffffffffff0,%rsp
    117d:	sub    $0x8,%rsp
    1181:	call   1000 <main>
    1186:	movzbq %al,%rdi
    118a:	mov    $0x3c,%rax
    1191:	syscall
    1193:	hlt
    1194:	data16 cs nopw 0x0(%rax,%rax,1)
    119f:	nop
```
Note the "and" to %rsp with $-16, it makes the %rsp be 16-byte aligned,
but then there is a "sub" with $0x8 which makes the %rsp no longer
16-byte aligned, then it calls main. That's the bug!

What actually the x86-64 System V ABI mandates is that right before the
"call", the %rsp must be 16-byte aligned, not after the "call". So the
"sub" with $0x8 here breaks the alignment. Remove it.

An example where this rule matters is when the callee needs to align
its stack at 16-byte for aligned move instruction, like `movdqa` and
`movaps`. If the callee can't align its stack properly, it will result
in segmentation fault.

x86-64 System V ABI also mandates the deepest stack frame should be
zero. Just to be safe, let's zero the %rbp on startup as the content
of %rbp may be unspecified when the program starts. Now it looks like
this:
```
0000000000001170 <_start>:
    1170:	pop    %rdi
    1171:	mov    %rsp,%rsi
    1174:	lea    0x8(%rsi,%rdi,8),%rdx
    1179:	xor    %ebp,%ebp                # zero the %rbp
    117b:	and    $0xfffffffffffffff0,%rsp # align the %rsp
    117f:	call   1000 <main>
    1184:	movzbq %al,%rdi
    1188:	mov    $0x3c,%rax
    118f:	syscall
    1191:	hlt
    1192:	data16 cs nopw 0x0(%rax,%rax,1)
    119d:	nopl   (%rax)
```

Cc: Bedirhan KURT <windowz414@gnuweeb.org>
Cc: Louvian Lyndal <louvianlyndal@gmail.com>
Reported-by: Peter Cordes <peter@cordes.ca>
Signed-off-by: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
[wt: I did this on purpose due to a misunderstanding of the spec, other
     archs will thus have to be rechecked, particularly i386]
Cc: stable@vger.kernel.org
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 3430667b0d24..96b6d56acb57 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -399,14 +399,20 @@ struct stat {
 })
 
 /* startup code */
+/*
+ * x86-64 System V ABI mandates:
+ * 1) %rsp must be 16-byte aligned right before the function call.
+ * 2) The deepest stack frame should be zero (the %rbp).
+ *
+ */
 asm(".section .text\n"
     ".global _start\n"
     "_start:\n"
     "pop %rdi\n"                // argc   (first arg, %rdi)
     "mov %rsp, %rsi\n"          // argv[] (second arg, %rsi)
     "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
-    "and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned when
-    "sub $8, %rsp\n"            // entering the callee
+    "xor %ebp, %ebp\n"          // zero the stack frame
+    "and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
     "call main\n"               // main() returns the status code, we'll exit with it.
     "movzb %al, %rdi\n"         // retrieve exit code from 8 lower bits
     "mov $60, %rax\n"           // NR_exit == 60
-- 
cgit v1.2.3


From ebbe0d8a449d183fa43b42d84fcb248e25303985 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sun, 24 Oct 2021 19:28:15 +0200
Subject: tools/nolibc: i386: fix initial stack alignment

After re-checking in the spec and comparing stack offsets with glibc,
The last pushed argument must be 16-byte aligned (i.e. aligned before the
call) so that in the callee esp+4 is multiple of 16, so the principle is
the 32-bit equivalent to what Ammar fixed for x86_64. It's possible that
32-bit code using SSE2 or MMX could have been affected. In addition the
frame pointer ought to be zero at the deepest level.

Link: https://gitlab.com/x86-psABIs/i386-ABI/-/wikis/Intel386-psABI
Cc: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Cc: stable@vger.kernel.org
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 96b6d56acb57..7f300dc379e7 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -583,13 +583,21 @@ struct sys_stat_struct {
 })
 
 /* startup code */
+/*
+ * i386 System V ABI mandates:
+ * 1) last pushed argument must be 16-byte aligned.
+ * 2) The deepest stack frame should be set to zero
+ *
+ */
 asm(".section .text\n"
     ".global _start\n"
     "_start:\n"
     "pop %eax\n"                // argc   (first arg, %eax)
     "mov %esp, %ebx\n"          // argv[] (second arg, %ebx)
     "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
-    "and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned when
+    "xor %ebp, %ebp\n"          // zero the stack frame
+    "and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned before
+    "sub $4, %esp\n"            // the call instruction (args are aligned)
     "push %ecx\n"               // push all registers on the stack so that we
     "push %ebx\n"               // support both regparm and plain stack modes
     "push %eax\n"
-- 
cgit v1.2.3


From de0244ae40ae91145faaf164a4252347607c3711 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sun, 24 Oct 2021 19:28:16 +0200
Subject: tools/nolibc: fix incorrect truncation of exit code

Ammar Faizi reported that our exit code handling is wrong. We truncate
it to the lowest 8 bits but the syscall itself is expected to take a
regular 32-bit signed integer, not an unsigned char. It's the kernel
that later truncates it to the lowest 8 bits. The difference is visible
in strace, where the program below used to show exit(255) instead of
exit(-1):

  int main(void)
  {
        return -1;
  }

This patch applies the fix to all archs. x86_64, i386, arm64, armv7 and
mips were all tested and confirmed to work fine now. Risc-v was not
tested but the change is trivial and exactly the same as for other archs.

Reported-by: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Cc: stable@vger.kernel.org
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 7f300dc379e7..3e2c6f2ed587 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -414,7 +414,7 @@ asm(".section .text\n"
     "xor %ebp, %ebp\n"          // zero the stack frame
     "and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
     "call main\n"               // main() returns the status code, we'll exit with it.
-    "movzb %al, %rdi\n"         // retrieve exit code from 8 lower bits
+    "mov %eax, %edi\n"          // retrieve exit code (32 bit)
     "mov $60, %rax\n"           // NR_exit == 60
     "syscall\n"                 // really exit
     "hlt\n"                     // ensure it does not return
@@ -602,9 +602,9 @@ asm(".section .text\n"
     "push %ebx\n"               // support both regparm and plain stack modes
     "push %eax\n"
     "call main\n"               // main() returns the status code in %eax
-    "movzbl %al, %ebx\n"        // retrieve exit code from lower 8 bits
-    "movl   $1, %eax\n"         // NR_exit == 1
-    "int    $0x80\n"            // exit now
+    "mov %eax, %ebx\n"          // retrieve exit code (32-bit int)
+    "movl $1, %eax\n"           // NR_exit == 1
+    "int $0x80\n"               // exit now
     "hlt\n"                     // ensure it does not
     "");
 
@@ -788,7 +788,6 @@ asm(".section .text\n"
     "and %r3, %r1, $-8\n"         // AAPCS : sp must be 8-byte aligned in the
     "mov %sp, %r3\n"              //         callee, an bl doesn't push (lr=pc)
     "bl main\n"                   // main() returns the status code, we'll exit with it.
-    "and %r0, %r0, $0xff\n"       // limit exit code to 8 bits
     "movs r7, $1\n"               // NR_exit == 1
     "svc $0x00\n"
     "");
@@ -985,7 +984,6 @@ asm(".section .text\n"
     "add x2, x2, x1\n"            //           + argv
     "and sp, x1, -16\n"           // sp must be 16-byte aligned in the callee
     "bl main\n"                   // main() returns the status code, we'll exit with it.
-    "and x0, x0, 0xff\n"          // limit exit code to 8 bits
     "mov x8, 93\n"                // NR_exit == 93
     "svc #0\n"
     "");
@@ -1190,7 +1188,7 @@ asm(".section .text\n"
     "addiu $sp,$sp,-16\n"         // the callee expects to save a0..a3 there!
     "jal main\n"                  // main() returns the status code, we'll exit with it.
     "nop\n"                       // delayed slot
-    "and $a0, $v0, 0xff\n"        // limit exit code to 8 bits
+    "move $a0, $v0\n"             // retrieve 32-bit exit code from v0
     "li $v0, 4001\n"              // NR_exit == 4001
     "syscall\n"
     ".end __start\n"
@@ -1388,7 +1386,6 @@ asm(".section .text\n"
     "add   a2,a2,a1\n"           //             + argv
     "andi  sp,a1,-16\n"          // sp must be 16-byte aligned
     "call  main\n"               // main() returns the status code, we'll exit with it.
-    "andi  a0, a0, 0xff\n"       // limit exit code to 8 bits
     "li a7, 93\n"                // NR_exit == 93
     "ecall\n"
     "");
-- 
cgit v1.2.3


From bf91666959eeac44fb686e9359e37830944beef2 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Date: Sun, 24 Oct 2021 19:43:21 +0200
Subject: tools/nolibc: x86: Remove `r8`, `r9` and `r10` from the clobber list

Linux x86-64 syscall only clobbers rax, rcx and r11 (and "memory").

  - rax for the return value.
  - rcx to save the return address.
  - r11 to save the rflags.

Other registers are preserved.

Having r8, r9 and r10 in the syscall clobber list is harmless, but this
results in a missed-optimization.

As the syscall doesn't clobber r8-r10, GCC should be allowed to reuse
their value after the syscall returns to userspace. But since they are
in the clobber list, GCC will always miss this opportunity.

Remove them from the x86-64 syscall clobber list to help GCC generate
better code and fix the comment.

See also the x86-64 ABI, section A.2 AMD64 Linux Kernel Conventions,
A.2.1 Calling Conventions [1].

Extra note:
Some people may think it does not really give a benefit to remove r8,
r9 and r10 from the syscall clobber list because the impression of
syscall is a C function call, and function call always clobbers those 3.

However, that is not the case for nolibc.h, because we have a potential
to inline the "syscall" instruction (which its opcode is "0f 05") to the
user functions.

All syscalls in the nolibc.h are written as a static function with inline
ASM and are likely always inline if we use optimization flag, so this is
a profit not to have r8, r9 and r10 in the clobber list.

Here is the example where this matters.

Consider the following C code:
```
  #include "tools/include/nolibc/nolibc.h"
  #define read_abc(a, b, c) __asm__ volatile("nop"::"r"(a),"r"(b),"r"(c))

  int main(void)
  {
  	int a = 0xaa;
  	int b = 0xbb;
  	int c = 0xcc;

  	read_abc(a, b, c);
  	write(1, "test\n", 5);
  	read_abc(a, b, c);

  	return 0;
  }
```

Compile with:
    gcc -Os test.c -o test -nostdlib

With r8, r9, r10 in the clobber list, GCC generates this:

0000000000001000 <main>:
    1000:	f3 0f 1e fa          	endbr64
    1004:	41 54                	push   %r12
    1006:	41 bc cc 00 00 00    	mov    $0xcc,%r12d
    100c:	55                   	push   %rbp
    100d:	bd bb 00 00 00       	mov    $0xbb,%ebp
    1012:	53                   	push   %rbx
    1013:	bb aa 00 00 00       	mov    $0xaa,%ebx
    1018:	90                   	nop
    1019:	b8 01 00 00 00       	mov    $0x1,%eax
    101e:	bf 01 00 00 00       	mov    $0x1,%edi
    1023:	ba 05 00 00 00       	mov    $0x5,%edx
    1028:	48 8d 35 d1 0f 00 00 	lea    0xfd1(%rip),%rsi
    102f:	0f 05                	syscall
    1031:	90                   	nop
    1032:	31 c0                	xor    %eax,%eax
    1034:	5b                   	pop    %rbx
    1035:	5d                   	pop    %rbp
    1036:	41 5c                	pop    %r12
    1038:	c3                   	ret

GCC thinks that syscall will clobber r8, r9, r10. So it spills 0xaa,
0xbb and 0xcc to callee saved registers (r12, rbp and rbx). This is
clearly extra memory access and extra stack size for preserving them.

But syscall does not actually clobber them, so this is a missed
optimization.

Now without r8, r9, r10 in the clobber list, GCC generates better code:

0000000000001000 <main>:
    1000:	f3 0f 1e fa          	endbr64
    1004:	41 b8 aa 00 00 00    	mov    $0xaa,%r8d
    100a:	41 b9 bb 00 00 00    	mov    $0xbb,%r9d
    1010:	41 ba cc 00 00 00    	mov    $0xcc,%r10d
    1016:	90                   	nop
    1017:	b8 01 00 00 00       	mov    $0x1,%eax
    101c:	bf 01 00 00 00       	mov    $0x1,%edi
    1021:	ba 05 00 00 00       	mov    $0x5,%edx
    1026:	48 8d 35 d3 0f 00 00 	lea    0xfd3(%rip),%rsi
    102d:	0f 05                	syscall
    102f:	90                   	nop
    1030:	31 c0                	xor    %eax,%eax
    1032:	c3                   	ret

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Acked-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Link: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI [1]
Link: https://lore.kernel.org/lkml/20211011040344.437264-1-ammar.faizi@students.amikom.ac.id/
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 3e2c6f2ed587..f9afe89ec6f2 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -265,12 +265,17 @@ struct stat {
  *   - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively
  *   - the system call is performed by calling the syscall instruction
  *   - syscall return comes in rax
- *   - rcx and r8..r11 may be clobbered, others are preserved.
+ *   - rcx and r11 are clobbered, others are preserved.
  *   - the arguments are cast to long and assigned into the target registers
  *     which are then simply passed as registers to the asm code, so that we
  *     don't have to experience issues with register constraints.
  *   - the syscall number is always specified last in order to allow to force
  *     some registers before (gcc refuses a %-register at the last position).
+ *   - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1
+ *     Calling Conventions.
+ *
+ * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/x86-64-psABI
+ *
  */
 
 #define my_syscall0(num)                                                      \
@@ -280,9 +285,9 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret)                                                 \
+		: "=a"(_ret)                                                  \
 		: "0"(_num)                                                   \
-		: "rcx", "r8", "r9", "r10", "r11", "memory", "cc"             \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -295,10 +300,10 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret)                                                 \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1),                                                 \
 		  "0"(_num)                                                   \
-		: "rcx", "r8", "r9", "r10", "r11", "memory", "cc"             \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -312,10 +317,10 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret)                                                 \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1), "r"(_arg2),                                     \
 		  "0"(_num)                                                   \
-		: "rcx", "r8", "r9", "r10", "r11", "memory", "cc"             \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -330,10 +335,10 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret)                                                 \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3),                         \
 		  "0"(_num)                                                   \
-		: "rcx", "r8", "r9", "r10", "r11", "memory", "cc"             \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -349,10 +354,10 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret), "=r"(_arg4)                                    \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4),             \
 		  "0"(_num)                                                   \
-		: "rcx", "r8", "r9", "r11", "memory", "cc"                    \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -369,10 +374,10 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret), "=r"(_arg4), "=r"(_arg5)                       \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
 		  "0"(_num)                                                   \
-		: "rcx", "r9", "r11", "memory", "cc"                          \
+		: "rcx", "r11", "memory", "cc"                                \
 	);                                                                    \
 	_ret;                                                                 \
 })
@@ -390,7 +395,7 @@ struct stat {
 									      \
 	asm volatile (                                                        \
 		"syscall\n"                                                   \
-		: "=a" (_ret), "=r"(_arg4), "=r"(_arg5)                       \
+		: "=a"(_ret)                                                  \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
 		  "r"(_arg6), "0"(_num)                                       \
 		: "rcx", "r11", "memory", "cc"                                \
-- 
cgit v1.2.3


From 7bdc0e7a390511cd3df8194003b908f15a6170a5 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Date: Sun, 24 Oct 2021 19:43:22 +0200
Subject: tools/nolibc: x86-64: Use `mov $60,%eax` instead of `mov $60,%rax`

Note that mov to 32-bit register will zero extend to 64-bit register.
Thus `mov $60,%eax` has the same effect with `mov $60,%rax`. Use the
shorter opcode to achieve the same thing.
```
  b8 3c 00 00 00       	mov    $60,%eax (5 bytes) [1]
  48 c7 c0 3c 00 00 00 	mov    $60,%rax (7 bytes) [2]
```
Currently, we use [2]. Change it to [1] for shorter code.

Signed-off-by: Ammar Faizi <ammar.faizi@students.amikom.ac.id>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index f9afe89ec6f2..4988866af0b5 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -420,7 +420,7 @@ asm(".section .text\n"
     "and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
     "call main\n"               // main() returns the status code, we'll exit with it.
     "mov %eax, %edi\n"          // retrieve exit code (32 bit)
-    "mov $60, %rax\n"           // NR_exit == 60
+    "mov $60, %eax\n"           // NR_exit == 60
     "syscall\n"                 // really exit
     "hlt\n"                     // ensure it does not return
     "");
-- 
cgit v1.2.3


From b0fe9dec66370cef23e9c281d229f4c44f84cfa2 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Sun, 24 Oct 2021 19:43:23 +0200
Subject: tools/nolibc: Implement gettid()

Allow test programs to determine their thread ID.

Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/nolibc.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 4988866af0b5..c1c285fe494a 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -1571,6 +1571,12 @@ pid_t sys_getpid(void)
 	return my_syscall0(__NR_getpid);
 }
 
+static __attribute__((unused))
+pid_t sys_gettid(void)
+{
+	return my_syscall0(__NR_gettid);
+}
+
 static __attribute__((unused))
 int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
@@ -2029,6 +2035,18 @@ pid_t getpid(void)
 	return ret;
 }
 
+static __attribute__((unused))
+pid_t gettid(void)
+{
+	pid_t ret = sys_gettid();
+
+	if (ret < 0) {
+		SET_ERRNO(-ret);
+		ret = -1;
+	}
+	return ret;
+}
+
 static __attribute__((unused))
 int gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-- 
cgit v1.2.3


From b6a4fd35d2d345fbfbd4421b0f626c3d8f1f6cf7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 21 Sep 2021 12:50:05 -0700
Subject: torture: Catch kvm.sh help text up with actual options

This commit brings the kvm.sh script's help text up to date with recently
(and some not-so-recently) added parameters.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 6cf9ec6a3d1c..6de0c183db5b 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -74,7 +74,9 @@ usage () {
 	echo "       --help"
 	echo "       --interactive"
 	echo "       --jitter N [ maxsleep (us) [ maxspin (us) ] ]"
+	echo "       --kasan"
 	echo "       --kconfig Kconfig-options"
+	echo "       --kcsan"
 	echo "       --kmake-arg kernel-make-arguments"
 	echo "       --mac nn:nn:nn:nn:nn:nn"
 	echo "       --memory megabytes|nnnG"
@@ -83,6 +85,7 @@ usage () {
 	echo "       --qemu-cmd qemu-system-..."
 	echo "       --remote"
 	echo "       --results absolute-pathname"
+	echo "       --shutdown-grace seconds"
 	echo "       --torture lock|rcu|rcuscale|refscale|scf"
 	echo "       --trust-make"
 	exit 1
@@ -175,14 +178,14 @@ do
 		jitter="$2"
 		shift
 		;;
+	--kasan)
+		TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+		;;
 	--kconfig|--kconfigs)
 		checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$'
 		TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
 		shift
 		;;
-	--kasan)
-		TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
-		;;
 	--kcsan)
 		TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
 		;;
-- 
cgit v1.2.3


From c06354a121777d3c6c5e60bed9d4f1aae04afd72 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 10 Nov 2021 07:47:41 -0800
Subject: torture: Make kvm-find-errors.sh report link-time undefined symbols

This commit makes kvm-find-errors.sh check for and report undefined
symbols that are detected at link time.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 4 ++--
 tools/testing/selftests/rcutorture/bin/parse-build.sh     | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
index daf64b507038..2e9e9e2eedb6 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -30,9 +30,9 @@ editor=${EDITOR-vi}
 files=
 for i in ${rundir}/*/Make.out
 do
-	if egrep -q "error:|warning:" < $i
+	if egrep -q "error:|warning:|^ld: .*undefined reference to" < $i
 	then
-		egrep "error:|warning:" < $i > $i.diags
+		egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags
 		files="$files $i.diags $i"
 	fi
 done
diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh
index 9313e5065ae9..2dbfca3589b1 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh
@@ -39,7 +39,8 @@ fi
 grep warning: < $F > $T/warnings
 grep "include/linux/*rcu*\.h:" $T/warnings > $T/hwarnings
 grep "kernel/rcu/[^/]*:" $T/warnings > $T/cwarnings
-cat $T/hwarnings $T/cwarnings > $T/rcuwarnings
+grep "^ld: .*undefined reference to" $T/warnings | head -1 > $T/ldwarnings
+cat $T/hwarnings $T/cwarnings $T/ldwarnings > $T/rcuwarnings
 if test -s $T/rcuwarnings
 then
 	print_warning $title build errors:
-- 
cgit v1.2.3


From f61537009e3a7c5be8d30ebf61cc9f8596800d4c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 22 Nov 2021 12:47:50 -0800
Subject: torture: Retry download once before giving up

Currently, a transient network error can kill a run if it happens while
downloading the tarball to one of the target systems.  This commit
therefore does a 60-second wait and then a retry.  If further experience
indicates, a more elaborate mechanism might be used later.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index c7d42ef80c53..67ebc56111e3 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -157,8 +157,15 @@ do
 	ret=$?
 	if test "$ret" -ne 0
 	then
-		echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log"
-		exit 10 | tee -a "$oldrun/remote-log"
+		echo Unable to download $T/binres.tgz to system $i, waiting and then retrying. | tee -a "$oldrun/remote-log"
+		sleep 60
+		cat $T/binres.tgz | ssh $i "cd /tmp; tar -xzf -"
+		ret=$?
+		if test "$ret" -ne 0
+		then
+			echo Unable to download $T/binres.tgz to system $i, giving up. | tee -a "$oldrun/remote-log"
+			exit 10
+		fi
 	fi
 done
 
-- 
cgit v1.2.3


From 4ead4e33194a4bb080f726fafef787b33ca98965 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 22 Nov 2021 21:02:43 -0800
Subject: rcutorture: Cause TREE02 and TREE10 scenarios to do more callback
 flooding

This commit enables two callback-flood kthreads for the TREE02 scenario
and 28 for the TREE10 scenario.

Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/TREE02.boot | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/TREE10.boot | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE02.boot
 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE10.boot

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE02.boot
new file mode 100644
index 000000000000..dd914fa8f690
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02.boot
@@ -0,0 +1 @@
+rcutorture.fwd_progress=2
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE10.boot
new file mode 100644
index 000000000000..dd914fa8f690
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10.boot
@@ -0,0 +1 @@
+rcutorture.fwd_progress=2
-- 
cgit v1.2.3


From a959ed627a426b5bf01cf29c9b011f63459e2345 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 24 Nov 2021 16:04:51 -0800
Subject: rcutorture: Test RCU Tasks lock-contention detection

This commit adjusts the TRACE02 scenario to use a pair of callback-flood
kthreads.  This in turn forces lock contention on the single RCU Tasks
Trace callback queue, which forces use of all CPUs' queues, thus testing
this transition.  (No, there is not yet any way to transition back.

Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
index 9675ad632dcc..c70b5db6c2ae 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
@@ -1 +1,2 @@
 rcutorture.torture_type=tasks-tracing
+rcutorture.fwd_progress=2
-- 
cgit v1.2.3


From b6c9dbf04f24e6bba91246abf55d1e0a13f5062a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 30 Nov 2021 11:33:01 -0800
Subject: torture: Fix incorrectly redirected "exit" in kvm-remote.sh

The "exit 4" in kvm-remote.sh is pointlessly redirected, so this commit
removes the redirection.

Fixes: 0092eae4cb4e ("torture: Add kvm-remote.sh script for distributed rcutorture test runs")
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index 67ebc56111e3..7ab79b0d5ad1 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -144,7 +144,7 @@ do
 	if test "$ret" -ne 0
 	then
 		echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log"
-		exit 4 | tee -a "$oldrun/remote-log"
+		exit 4
 	fi
 done
 
-- 
cgit v1.2.3


From 90b21bcfb2846625c4f2e4a0bf5969543cef8ba7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 30 Nov 2021 11:47:37 -0800
Subject: torture: Properly redirect kvm-remote.sh "echo" commands

The echo commands following initialization of the "oldrun" variable need
to be "tee"d to $oldrun/remote-log.  This commit fixes several stragglers.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index 7ab79b0d5ad1..e09b1bc78708 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -184,16 +184,16 @@ checkremotefile () {
 		ret=$?
 		if test "$ret" -eq 255
 		then
-			echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date`
+			echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log"
 		elif test "$ret" -eq 0
 		then
 			return 0
 		elif test "$ret" -eq 1
 		then
-			echo " ---" File \"$2\" not found: ssh $1 test -f \"$2\"
+			echo " ---" File \"$2\" not found: ssh $1 test -f \"$2\" | tee -a "$oldrun/remote-log"
 			return 1
 		else
-			echo " ---" Exit code $ret: ssh $1 test -f \"$2\", retry after $sleeptime seconds. `date`
+			echo " ---" Exit code $ret: ssh $1 test -f \"$2\", retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log"
 			return $ret
 		fi
 		sleep $sleeptime
@@ -252,7 +252,7 @@ do
 		sleep 30
 	fi
 done
-echo All batches started. `date`
+echo All batches started. `date` | tee -a "$oldrun/remote-log"
 
 # Wait for all remaining scenarios to complete and collect results.
 for i in $systems
@@ -261,7 +261,7 @@ do
 	do
 		sleep 30
 	done
-	echo " ---" Collecting results from $i `date`
+	echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log"
 	( cd "$oldrun"; ssh $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
 done
 
-- 
cgit v1.2.3


From ddfe12944e84830fe7dc490992e55b4fa773555e Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 25 Oct 2021 22:54:14 +0800
Subject: tools/memory-model: Provide extra ordering for unlock+lock pair on
 the same CPU

A recent discussion[1] shows that we are in favor of strengthening the
ordering of unlock + lock on the same CPU: a unlock and a po-after lock
should provide the so-called RCtso ordering, that is a memory access S
po-before the unlock should be ordered against a memory access R
po-after the lock, unless S is a store and R is a load.

The strengthening meets programmers' expection that "sequence of two
locked regions to be ordered wrt each other" (from Linus), and can
reduce the mental burden when using locks. Therefore add it in LKMM.

[1]: https://lore.kernel.org/lkml/20210909185937.GA12379@rowland.harvard.edu/

Co-developed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Palmer Dabbelt <palmerdabbelt@google.com> (RISC-V)
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/Documentation/explanation.txt | 44 ++++++++++++++----------
 tools/memory-model/linux-kernel.cat              |  6 ++--
 2 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index 5d72f3112e56..394ee57d58f2 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1813,15 +1813,16 @@ spin_trylock() -- we can call these things lock-releases and
 lock-acquires -- have two properties beyond those of ordinary releases
 and acquires.
 
-First, when a lock-acquire reads from a lock-release, the LKMM
-requires that every instruction po-before the lock-release must
-execute before any instruction po-after the lock-acquire.  This would
-naturally hold if the release and acquire operations were on different
-CPUs, but the LKMM says it holds even when they are on the same CPU.
-For example:
+First, when a lock-acquire reads from or is po-after a lock-release,
+the LKMM requires that every instruction po-before the lock-release
+must execute before any instruction po-after the lock-acquire.  This
+would naturally hold if the release and acquire operations were on
+different CPUs and accessed the same lock variable, but the LKMM says
+it also holds when they are on the same CPU, even if they access
+different lock variables.  For example:
 
 	int x, y;
-	spinlock_t s;
+	spinlock_t s, t;
 
 	P0()
 	{
@@ -1830,9 +1831,9 @@ For example:
 		spin_lock(&s);
 		r1 = READ_ONCE(x);
 		spin_unlock(&s);
-		spin_lock(&s);
+		spin_lock(&t);
 		r2 = READ_ONCE(y);
-		spin_unlock(&s);
+		spin_unlock(&t);
 	}
 
 	P1()
@@ -1842,10 +1843,10 @@ For example:
 		WRITE_ONCE(x, 1);
 	}
 
-Here the second spin_lock() reads from the first spin_unlock(), and
-therefore the load of x must execute before the load of y.  Thus we
-cannot have r1 = 1 and r2 = 0 at the end (this is an instance of the
-MP pattern).
+Here the second spin_lock() is po-after the first spin_unlock(), and
+therefore the load of x must execute before the load of y, even though
+the two locking operations use different locks.  Thus we cannot have
+r1 = 1 and r2 = 0 at the end (this is an instance of the MP pattern).
 
 This requirement does not apply to ordinary release and acquire
 fences, only to lock-related operations.  For instance, suppose P0()
@@ -1872,13 +1873,13 @@ instructions in the following order:
 
 and thus it could load y before x, obtaining r2 = 0 and r1 = 1.
 
-Second, when a lock-acquire reads from a lock-release, and some other
-stores W and W' occur po-before the lock-release and po-after the
-lock-acquire respectively, the LKMM requires that W must propagate to
-each CPU before W' does.  For example, consider:
+Second, when a lock-acquire reads from or is po-after a lock-release,
+and some other stores W and W' occur po-before the lock-release and
+po-after the lock-acquire respectively, the LKMM requires that W must
+propagate to each CPU before W' does.  For example, consider:
 
 	int x, y;
-	spinlock_t x;
+	spinlock_t s;
 
 	P0()
 	{
@@ -1908,7 +1909,12 @@ each CPU before W' does.  For example, consider:
 
 If r1 = 1 at the end then the spin_lock() in P1 must have read from
 the spin_unlock() in P0.  Hence the store to x must propagate to P2
-before the store to y does, so we cannot have r2 = 1 and r3 = 0.
+before the store to y does, so we cannot have r2 = 1 and r3 = 0.  But
+if P1 had used a lock variable different from s, the writes could have
+propagated in either order.  (On the other hand, if the code in P0 and
+P1 had all executed on a single CPU, as in the example before this
+one, then the writes would have propagated in order even if the two
+critical sections used different lock variables.)
 
 These two special requirements for lock-release and lock-acquire do
 not arise from the operational model.  Nevertheless, kernel developers
diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat
index 2a9b4fe4a84e..d70315fddef6 100644
--- a/tools/memory-model/linux-kernel.cat
+++ b/tools/memory-model/linux-kernel.cat
@@ -27,7 +27,7 @@ include "lock.cat"
 (* Release Acquire *)
 let acq-po = [Acquire] ; po ; [M]
 let po-rel = [M] ; po ; [Release]
-let po-unlock-rf-lock-po = po ; [UL] ; rf ; [LKR] ; po
+let po-unlock-lock-po = po ; [UL] ; (po|rf) ; [LKR] ; po
 
 (* Fences *)
 let R4rmb = R \ Noreturn	(* Reads for which rmb works *)
@@ -70,12 +70,12 @@ let rwdep = (dep | ctrl) ; [W]
 let overwrite = co | fr
 let to-w = rwdep | (overwrite & int) | (addr ; [Plain] ; wmb)
 let to-r = addr | (dep ; [Marked] ; rfi)
-let ppo = to-r | to-w | fence | (po-unlock-rf-lock-po & int)
+let ppo = to-r | to-w | fence | (po-unlock-lock-po & int)
 
 (* Propagation: Ordering from release operations and strong fences. *)
 let A-cumul(r) = (rfe ; [Marked])? ; r
 let cumul-fence = [Marked] ; (A-cumul(strong-fence | po-rel) | wmb |
-	po-unlock-rf-lock-po) ; [Marked]
+	po-unlock-lock-po) ; [Marked]
 let prop = [Marked] ; (overwrite & ext)? ; cumul-fence* ;
 	[Marked] ; rfe? ; [Marked]
 
-- 
cgit v1.2.3


From b47c05ecf60bd8743ad8c0ee510d3e1c060529d7 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 25 Oct 2021 22:54:15 +0800
Subject: tools/memory-model: doc: Describe the requirement of the litmus-tests
 directory

It's better that we have some "standard" about which test should be put
in the litmus-tests directory because it helps future contributors
understand whether they should work on litmus-tests in kernel or Paul's
GitHub repo. Therefore explain a little bit on what a "representative"
litmus test is.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/README | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'tools')

diff --git a/tools/memory-model/README b/tools/memory-model/README
index 9a84c45504ab..9edd402704c4 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -195,6 +195,18 @@ litmus-tests
 	are listed in litmus-tests/README.  A great deal more litmus
 	tests are available at https://github.com/paulmckrcu/litmus.
 
+	By "representative", it means the one in the litmus-tests
+	directory is:
+
+		1) simple, the number of threads should be relatively
+		   small and each thread function should be relatively
+		   simple.
+		2) orthogonal, there should be no two litmus tests
+		   describing the same aspect of the memory model.
+		3) textbook, developers can easily copy-paste-modify
+		   the litmus tests to use the patterns on their own
+		   code.
+
 lock.cat
 	Provides a front-end analysis of lock acquisition and release,
 	for example, associating a lock acquisition with the preceding
-- 
cgit v1.2.3


From c438b7d860b4c1acb4ebff6d8d946d593ca5fe1e Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 25 Oct 2021 22:54:16 +0800
Subject: tools/memory-model: litmus: Add two tests for unlock(A)+lock(B)
 ordering

The memory model has been updated to provide a stronger ordering
guarantee for unlock(A)+lock(B) on the same CPU/thread. Therefore add
two litmus tests describing this new guarantee, these tests are simple
yet can clearly show the usage of the new guarantee, also they can serve
as the self tests for the modification in the model.

Co-developed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../LB+unlocklockonceonce+poacquireonce.litmus     | 35 ++++++++++++++++++++++
 .../MP+unlocklockonceonce+fencermbonceonce.litmus  | 33 ++++++++++++++++++++
 tools/memory-model/litmus-tests/README             |  8 +++++
 3 files changed, 76 insertions(+)
 create mode 100644 tools/memory-model/litmus-tests/LB+unlocklockonceonce+poacquireonce.litmus
 create mode 100644 tools/memory-model/litmus-tests/MP+unlocklockonceonce+fencermbonceonce.litmus

(limited to 'tools')

diff --git a/tools/memory-model/litmus-tests/LB+unlocklockonceonce+poacquireonce.litmus b/tools/memory-model/litmus-tests/LB+unlocklockonceonce+poacquireonce.litmus
new file mode 100644
index 000000000000..eb34123a6ffe
--- /dev/null
+++ b/tools/memory-model/litmus-tests/LB+unlocklockonceonce+poacquireonce.litmus
@@ -0,0 +1,35 @@
+C LB+unlocklockonceonce+poacquireonce
+
+(*
+ * Result: Never
+ *
+ * If two locked critical sections execute on the same CPU, all accesses
+ * in the first must execute before any accesses in the second, even if the
+ * critical sections are protected by different locks.  Note: Even when a
+ * write executes before a read, their memory effects can be reordered from
+ * the viewpoint of another CPU (the kind of reordering allowed by TSO).
+ *)
+
+{}
+
+P0(spinlock_t *s, spinlock_t *t, int *x, int *y)
+{
+	int r1;
+
+	spin_lock(s);
+	r1 = READ_ONCE(*x);
+	spin_unlock(s);
+	spin_lock(t);
+	WRITE_ONCE(*y, 1);
+	spin_unlock(t);
+}
+
+P1(int *x, int *y)
+{
+	int r2;
+
+	r2 = smp_load_acquire(y);
+	WRITE_ONCE(*x, 1);
+}
+
+exists (0:r1=1 /\ 1:r2=1)
diff --git a/tools/memory-model/litmus-tests/MP+unlocklockonceonce+fencermbonceonce.litmus b/tools/memory-model/litmus-tests/MP+unlocklockonceonce+fencermbonceonce.litmus
new file mode 100644
index 000000000000..2feb1398be71
--- /dev/null
+++ b/tools/memory-model/litmus-tests/MP+unlocklockonceonce+fencermbonceonce.litmus
@@ -0,0 +1,33 @@
+C MP+unlocklockonceonce+fencermbonceonce
+
+(*
+ * Result: Never
+ *
+ * If two locked critical sections execute on the same CPU, stores in the
+ * first must propagate to each CPU before stores in the second do, even if
+ * the critical sections are protected by different locks.
+ *)
+
+{}
+
+P0(spinlock_t *s, spinlock_t *t, int *x, int *y)
+{
+	spin_lock(s);
+	WRITE_ONCE(*x, 1);
+	spin_unlock(s);
+	spin_lock(t);
+	WRITE_ONCE(*y, 1);
+	spin_unlock(t);
+}
+
+P1(int *x, int *y)
+{
+	int r1;
+	int r2;
+
+	r1 = READ_ONCE(*y);
+	smp_rmb();
+	r2 = READ_ONCE(*x);
+}
+
+exists (1:r1=1 /\ 1:r2=0)
diff --git a/tools/memory-model/litmus-tests/README b/tools/memory-model/litmus-tests/README
index 681f9067fa9e..d311a0ff1ae6 100644
--- a/tools/memory-model/litmus-tests/README
+++ b/tools/memory-model/litmus-tests/README
@@ -63,6 +63,10 @@ LB+poonceonces.litmus
 	As above, but with store-release replaced with WRITE_ONCE()
 	and load-acquire replaced with READ_ONCE().
 
+LB+unlocklockonceonce+poacquireonce.litmus
+	Does a unlock+lock pair provides ordering guarantee between a
+	load and a store?
+
 MP+onceassign+derefonce.litmus
 	As below, but with rcu_assign_pointer() and an rcu_dereference().
 
@@ -90,6 +94,10 @@ MP+porevlocks.litmus
 	As below, but with the first access of the writer process
 	and the second access of reader process protected by a lock.
 
+MP+unlocklockonceonce+fencermbonceonce.litmus
+	Does a unlock+lock pair provides ordering guarantee between a
+	store and another store?
+
 MP+fencewmbonceonce+fencermbonceonce.litmus
 	Does a smp_wmb() (between the stores) and an smp_rmb() (between
 	the loads) suffice for the message-passing litmus test, where one
-- 
cgit v1.2.3


From a05431b22be819d75db72ca3d44381d18a37b092 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Thu, 2 Dec 2021 10:28:41 +0800
Subject: selftests: net: Correct case name

ipv6_addr_bind/ipv4_addr_bind are function names. Previously, bind test
would not be run by default due to the wrong case names

Fixes: 34d0302ab861 ("selftests: Add ipv6 address bind tests to fcnal-test")
Fixes: 75b2b2b3db4c ("selftests: Add ipv4 address bind tests to fcnal-test")
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 3313566ce906..7f5b265fcb90 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -4002,8 +4002,8 @@ EOF
 ################################################################################
 # main
 
-TESTS_IPV4="ipv4_ping ipv4_tcp ipv4_udp ipv4_addr_bind ipv4_runtime ipv4_netfilter"
-TESTS_IPV6="ipv6_ping ipv6_tcp ipv6_udp ipv6_addr_bind ipv6_runtime ipv6_netfilter"
+TESTS_IPV4="ipv4_ping ipv4_tcp ipv4_udp ipv4_bind ipv4_runtime ipv4_netfilter"
+TESTS_IPV6="ipv6_ping ipv6_tcp ipv6_udp ipv6_bind ipv6_runtime ipv6_netfilter"
 TESTS_OTHER="use_cases"
 
 PAUSE_ON_FAIL=no
-- 
cgit v1.2.3


From 36d7d36fcf69e7f95068307f8bc519af31085431 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Thu, 2 Dec 2021 10:29:53 +0800
Subject: selftests: net: remove meaningless help option

$ ./fcnal-test.sh -t help
Test names: help

Looks it intent to list the available tests but it didn't do the right
thing. I will add another option the do that in the later patch.

Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 7caa4f0e067d..128f791a5b7d 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -4108,8 +4108,6 @@ do
 	# setup namespaces and config, but do not run any tests
 	setup)		 setup; exit 0;;
 	vrf_setup)	 setup "yes"; exit 0;;
-
-	help)            echo "Test names: $TESTS"; exit 0;;
 	esac
 done
 
-- 
cgit v1.2.3


From 74753e1462e77349525daf9eb60ea21ed92d3a97 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:24 -0800
Subject: libbpf: Replace btf__type_by_id() with btf_type_by_id().

To prepare relo_core.c to be compiled in the kernel and the user space
replace btf__type_by_id with btf_type_by_id.

In libbpf btf__type_by_id and btf_type_by_id have different behavior.

bpf_core_apply_relo_insn() needs behavior of uapi btf__type_by_id
vs internal btf_type_by_id, but type_id range check is already done
in bpf_core_apply_relo(), so it's safe to replace it everywhere.
The kernel btf_type_by_id() does the check anyway. It doesn't hurt.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-2-alexei.starovoitov@gmail.com
---
 tools/lib/bpf/btf.c             |  2 +-
 tools/lib/bpf/libbpf_internal.h |  2 +-
 tools/lib/bpf/relo_core.c       | 19 ++++++++-----------
 3 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8024fe355ca8..0d7b16eab569 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -454,7 +454,7 @@ const struct btf *btf__base_btf(const struct btf *btf)
 }
 
 /* internal helper returning non-const pointer to a type */
-struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id)
+struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id)
 {
 	if (type_id == 0)
 		return &btf_void;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 311905d8ca70..6f143e9e810c 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -172,7 +172,7 @@ static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size)
 struct btf;
 struct btf_type;
 
-struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id);
+struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id);
 const char *btf_kind_str(const struct btf_type *t);
 const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id);
 
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index b5b8956a1be8..c0904f4cb514 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -51,7 +51,7 @@ static bool is_flex_arr(const struct btf *btf,
 		return false;
 
 	/* has to be the last member of enclosing struct */
-	t = btf__type_by_id(btf, acc->type_id);
+	t = btf_type_by_id(btf, acc->type_id);
 	return acc->idx == btf_vlen(t) - 1;
 }
 
@@ -388,7 +388,7 @@ static int bpf_core_match_member(const struct btf *local_btf,
 		return 0;
 
 	local_id = local_acc->type_id;
-	local_type = btf__type_by_id(local_btf, local_id);
+	local_type = btf_type_by_id(local_btf, local_id);
 	local_member = btf_members(local_type) + local_acc->idx;
 	local_name = btf__name_by_offset(local_btf, local_member->name_off);
 
@@ -580,7 +580,7 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 		return -EUCLEAN; /* request instruction poisoning */
 
 	acc = &spec->spec[spec->len - 1];
-	t = btf__type_by_id(spec->btf, acc->type_id);
+	t = btf_type_by_id(spec->btf, acc->type_id);
 
 	/* a[n] accessor needs special handling */
 	if (!acc->name) {
@@ -729,7 +729,7 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo,
 	case BPF_ENUMVAL_VALUE:
 		if (!spec)
 			return -EUCLEAN; /* request instruction poisoning */
-		t = btf__type_by_id(spec->btf, spec->spec[0].type_id);
+		t = btf_type_by_id(spec->btf, spec->spec[0].type_id);
 		e = btf_enum(t) + spec->spec[0].idx;
 		*val = e->val;
 		break;
@@ -805,8 +805,8 @@ static int bpf_core_calc_relo(const char *prog_name,
 		if (res->orig_sz != res->new_sz) {
 			const struct btf_type *orig_t, *new_t;
 
-			orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id);
-			new_t = btf__type_by_id(targ_spec->btf, res->new_type_id);
+			orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id);
+			new_t = btf_type_by_id(targ_spec->btf, res->new_type_id);
 
 			/* There are two use cases in which it's safe to
 			 * adjust load/store's mem size:
@@ -1054,7 +1054,7 @@ static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec)
 	int i;
 
 	type_id = spec->root_type_id;
-	t = btf__type_by_id(spec->btf, type_id);
+	t = btf_type_by_id(spec->btf, type_id);
 	s = btf__name_by_offset(spec->btf, t->name_off);
 
 	libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? "<anon>" : s);
@@ -1158,10 +1158,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 	int i, j, err;
 
 	local_id = relo->type_id;
-	local_type = btf__type_by_id(local_btf, local_id);
-	if (!local_type)
-		return -EINVAL;
-
+	local_type = btf_type_by_id(local_btf, local_id);
 	local_name = btf__name_by_offset(local_btf, local_type->name_off);
 	if (!local_name)
 		return -EINVAL;
-- 
cgit v1.2.3


From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:26 -0800
Subject: bpf: Prepare relo_core.c for kernel duty.

Make relo_core.c to be compiled for the kernel and for user space libbpf.

Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32.
This is the maximum number of nested structs and arrays.
For example:
 struct sample {
     int a;
     struct {
         int b[10];
     };
 };

 struct sample *s = ...;
 int *y = &s->b[5];
This field access is encoded as "0:1:0:5" and spec len is 4.

The follow up patch might bump it back to 64.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com
---
 include/linux/btf.h       | 81 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/Makefile       |  4 +++
 kernel/bpf/btf.c          | 26 +++++++++++++++
 tools/lib/bpf/relo_core.c | 76 +++++++++++++++++++++++++++++++++++++-------
 4 files changed, 176 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 956f70388f69..acef6ef28768 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
 }
 
+static inline bool str_is_empty(const char *s)
+{
+	return !s || !s[0];
+}
+
+static inline u16 btf_kind(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info);
+}
+
+static inline bool btf_is_enum(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ENUM;
+}
+
+static inline bool btf_is_composite(const struct btf_type *t)
+{
+	u16 kind = btf_kind(t);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static inline bool btf_is_array(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_ARRAY;
+}
+
+static inline bool btf_is_int(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_INT;
+}
+
+static inline bool btf_is_ptr(const struct btf_type *t)
+{
+	return btf_kind(t) == BTF_KIND_PTR;
+}
+
+static inline u8 btf_int_offset(const struct btf_type *t)
+{
+	return BTF_INT_OFFSET(*(u32 *)(t + 1));
+}
+
+static inline u8 btf_int_encoding(const struct btf_type *t)
+{
+	return BTF_INT_ENCODING(*(u32 *)(t + 1));
+}
+
 static inline bool btf_type_is_scalar(const struct btf_type *t)
 {
 	return btf_type_is_int(t) || btf_type_is_enum(t);
@@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t)
 	return BTF_INFO_VLEN(t->info);
 }
 
+static inline u16 btf_vlen(const struct btf_type *t)
+{
+	return btf_type_vlen(t);
+}
+
 static inline u16 btf_func_linkage(const struct btf_type *t)
 {
 	return BTF_INFO_VLEN(t->info);
@@ -208,11 +260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type,
 					   : 0;
 }
 
+static inline struct btf_member *btf_members(const struct btf_type *t)
+{
+	return (struct btf_member *)(t + 1);
+}
+
+static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx)
+{
+	const struct btf_member *m = btf_members(t) + member_idx;
+
+	return __btf_member_bit_offset(t, m);
+}
+
+static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx)
+{
+	const struct btf_member *m = btf_members(t) + member_idx;
+
+	return __btf_member_bitfield_size(t, m);
+}
+
 static inline const struct btf_member *btf_type_member(const struct btf_type *t)
 {
 	return (const struct btf_member *)(t + 1);
 }
 
+static inline struct btf_array *btf_array(const struct btf_type *t)
+{
+	return (struct btf_array *)(t + 1);
+}
+
+static inline struct btf_enum *btf_enum(const struct btf_type *t)
+{
+	return (struct btf_enum *)(t + 1);
+}
+
 static inline const struct btf_var_secinfo *btf_type_var_secinfo(
 		const struct btf_type *t)
 {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index cf6ca339f3cd..c1a9be6a4b9f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
 obj-${CONFIG_BPF_LSM} += bpf_lsm.o
 endif
 obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+	$(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f4119a99da7b..c79595aad55b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6413,3 +6413,29 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
 
 DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
 DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+			      const struct btf *targ_btf, __u32 targ_id)
+{
+	return -EOPNOTSUPP;
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+	/* check X___Y name pattern, where X and Y are not underscores */
+	return s[0] != '_' &&				      /* X */
+	       s[1] == '_' && s[2] == '_' && s[3] == '_' &&   /* ___ */
+	       s[4] != '_';				      /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+	size_t n = strlen(name);
+	int i;
+
+	for (i = n - 5; i >= 0; i--) {
+		if (bpf_core_is_flavor_sep(name + i))
+			return i + 1;
+	}
+	return n;
+}
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index c0904f4cb514..56dbe6d16664 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -1,6 +1,60 @@
 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 /* Copyright (c) 2019 Facebook */
 
+#ifdef __KERNEL__
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/string.h>
+#include <linux/bpf_verifier.h>
+#include "relo_core.h"
+
+static const char *btf_kind_str(const struct btf_type *t)
+{
+	return btf_type_str(t);
+}
+
+static bool is_ldimm64_insn(struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
+static const struct btf_type *
+skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id)
+{
+	return btf_type_skip_modifiers(btf, id, res_id);
+}
+
+static const char *btf__name_by_offset(const struct btf *btf, u32 offset)
+{
+	return btf_name_by_offset(btf, offset);
+}
+
+static s64 btf__resolve_size(const struct btf *btf, u32 type_id)
+{
+	const struct btf_type *t;
+	int size;
+
+	t = btf_type_by_id(btf, type_id);
+	t = btf_resolve_size(btf, t, &size);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+	return size;
+}
+
+enum libbpf_print_level {
+	LIBBPF_WARN,
+	LIBBPF_INFO,
+	LIBBPF_DEBUG,
+};
+
+#undef pr_warn
+#undef pr_info
+#undef pr_debug
+#define pr_warn(fmt, log, ...)	bpf_log((void *)log, fmt, "", ##__VA_ARGS__)
+#define pr_info(fmt, log, ...)	bpf_log((void *)log, fmt, "", ##__VA_ARGS__)
+#define pr_debug(fmt, log, ...)	bpf_log((void *)log, fmt, "", ##__VA_ARGS__)
+#define libbpf_print(level, fmt, ...)	bpf_log((void *)prog_name, fmt, ##__VA_ARGS__)
+#else
 #include <stdio.h>
 #include <string.h>
 #include <errno.h>
@@ -12,8 +66,9 @@
 #include "btf.h"
 #include "str_error.h"
 #include "libbpf_internal.h"
+#endif
 
-#define BPF_CORE_SPEC_MAX_LEN 64
+#define BPF_CORE_SPEC_MAX_LEN 32
 
 /* represents BPF CO-RE field or array element accessor */
 struct bpf_core_accessor {
@@ -150,7 +205,7 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind)
  * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access
  * string to specify enumerator's value index that need to be relocated.
  */
-static int bpf_core_parse_spec(const struct btf *btf,
+static int bpf_core_parse_spec(const char *prog_name, const struct btf *btf,
 			       __u32 type_id,
 			       const char *spec_str,
 			       enum bpf_core_relo_kind relo_kind,
@@ -272,8 +327,8 @@ static int bpf_core_parse_spec(const struct btf *btf,
 				return sz;
 			spec->bit_offset += access_idx * sz * 8;
 		} else {
-			pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n",
-				type_id, spec_str, i, id, btf_kind_str(t));
+			pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n",
+				prog_name, type_id, spec_str, i, id, btf_kind_str(t));
 			return -EINVAL;
 		}
 	}
@@ -346,8 +401,6 @@ recur:
 		targ_id = btf_array(targ_type)->type;
 		goto recur;
 	default:
-		pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n",
-			btf_kind(local_type), local_id, targ_id);
 		return 0;
 	}
 }
@@ -1045,7 +1098,7 @@ poison:
  * [<type-id>] (<type-name>) + <raw-spec> => <offset>@<spec>,
  * where <spec> is a C-syntax view of recorded field access, e.g.: x.a[3].b
  */
-static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec)
+static void bpf_core_dump_spec(const char *prog_name, int level, const struct bpf_core_spec *spec)
 {
 	const struct btf_type *t;
 	const struct btf_enum *e;
@@ -1167,7 +1220,8 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 	if (str_is_empty(spec_str))
 		return -EINVAL;
 
-	err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec);
+	err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str,
+				  relo->kind, &local_spec);
 	if (err) {
 		pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n",
 			prog_name, relo_idx, local_id, btf_kind_str(local_type),
@@ -1178,7 +1232,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 
 	pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name,
 		 relo_idx, core_relo_kind_str(relo->kind), relo->kind);
-	bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec);
+	bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec);
 	libbpf_print(LIBBPF_DEBUG, "\n");
 
 	/* TYPE_ID_LOCAL relo is special and doesn't need candidate search */
@@ -1204,14 +1258,14 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 		if (err < 0) {
 			pr_warn("prog '%s': relo #%d: error matching candidate #%d ",
 				prog_name, relo_idx, i);
-			bpf_core_dump_spec(LIBBPF_WARN, &cand_spec);
+			bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec);
 			libbpf_print(LIBBPF_WARN, ": %d\n", err);
 			return err;
 		}
 
 		pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name,
 			 relo_idx, err == 0 ? "non-matching" : "matching", i);
-		bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec);
+		bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec);
 		libbpf_print(LIBBPF_DEBUG, "\n");
 
 		if (err == 0)
-- 
cgit v1.2.3


From 46334a0cd21bed70d6f1ddef1464f75a0ebe1774 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:27 -0800
Subject: bpf: Define enum bpf_core_relo_kind as uapi.

enum bpf_core_relo_kind is generated by llvm and processed by libbpf.
It's a de-facto uapi.
With CO-RE in the kernel the bpf_core_relo_kind values become uapi de-jure.
Also rename them with BPF_CORE_ prefix to distinguish from conflicting names in
bpf_core_read.h. The enums bpf_field_info_kind, bpf_type_id_kind,
bpf_type_info_kind, bpf_enum_value_kind are passing different values from bpf
program into llvm.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-5-alexei.starovoitov@gmail.com
---
 include/uapi/linux/bpf.h       | 19 ++++++++++
 tools/include/uapi/linux/bpf.h | 19 ++++++++++
 tools/lib/bpf/libbpf.c         |  2 +-
 tools/lib/bpf/relo_core.c      | 84 +++++++++++++++++++++---------------------
 tools/lib/bpf/relo_core.h      | 18 +--------
 5 files changed, 82 insertions(+), 60 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 211b43afd0fb..9e66b1880020 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6374,4 +6374,23 @@ enum {
 	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value
+ * has to be adjusted by relocations. It is emitted by llvm and passed to
+ * libbpf and later to the kernel.
+ */
+enum bpf_core_relo_kind {
+	BPF_CORE_FIELD_BYTE_OFFSET = 0,      /* field byte offset */
+	BPF_CORE_FIELD_BYTE_SIZE = 1,        /* field size in bytes */
+	BPF_CORE_FIELD_EXISTS = 2,           /* field existence in target kernel */
+	BPF_CORE_FIELD_SIGNED = 3,           /* field signedness (0 - unsigned, 1 - signed) */
+	BPF_CORE_FIELD_LSHIFT_U64 = 4,       /* bitfield-specific left bitshift */
+	BPF_CORE_FIELD_RSHIFT_U64 = 5,       /* bitfield-specific right bitshift */
+	BPF_CORE_TYPE_ID_LOCAL = 6,          /* type ID in local BPF object */
+	BPF_CORE_TYPE_ID_TARGET = 7,         /* type ID in target kernel */
+	BPF_CORE_TYPE_EXISTS = 8,            /* type existence in target kernel */
+	BPF_CORE_TYPE_SIZE = 9,              /* type size in bytes */
+	BPF_CORE_ENUMVAL_EXISTS = 10,        /* enum value existence in target kernel */
+	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 211b43afd0fb..9e66b1880020 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6374,4 +6374,23 @@ enum {
 	BTF_F_ZERO	=	(1ULL << 3),
 };
 
+/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value
+ * has to be adjusted by relocations. It is emitted by llvm and passed to
+ * libbpf and later to the kernel.
+ */
+enum bpf_core_relo_kind {
+	BPF_CORE_FIELD_BYTE_OFFSET = 0,      /* field byte offset */
+	BPF_CORE_FIELD_BYTE_SIZE = 1,        /* field size in bytes */
+	BPF_CORE_FIELD_EXISTS = 2,           /* field existence in target kernel */
+	BPF_CORE_FIELD_SIGNED = 3,           /* field signedness (0 - unsigned, 1 - signed) */
+	BPF_CORE_FIELD_LSHIFT_U64 = 4,       /* bitfield-specific left bitshift */
+	BPF_CORE_FIELD_RSHIFT_U64 = 5,       /* bitfield-specific right bitshift */
+	BPF_CORE_TYPE_ID_LOCAL = 6,          /* type ID in local BPF object */
+	BPF_CORE_TYPE_ID_TARGET = 7,         /* type ID in target kernel */
+	BPF_CORE_TYPE_EXISTS = 8,            /* type existence in target kernel */
+	BPF_CORE_TYPE_SIZE = 9,              /* type size in bytes */
+	BPF_CORE_ENUMVAL_EXISTS = 10,        /* enum value existence in target kernel */
+	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5a2f5a6ae2f9..9eaf2d9820e6 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5523,7 +5523,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
 		return -ENOTSUP;
 	}
 
-	if (relo->kind != BPF_TYPE_ID_LOCAL &&
+	if (relo->kind != BPF_CORE_TYPE_ID_LOCAL &&
 	    !hashmap__find(cand_cache, type_key, (void **)&cands)) {
 		cands = bpf_core_find_cands(prog->obj, local_btf, local_id);
 		if (IS_ERR(cands)) {
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index 56dbe6d16664..d194fb9306ed 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -113,18 +113,18 @@ static bool is_flex_arr(const struct btf *btf,
 static const char *core_relo_kind_str(enum bpf_core_relo_kind kind)
 {
 	switch (kind) {
-	case BPF_FIELD_BYTE_OFFSET: return "byte_off";
-	case BPF_FIELD_BYTE_SIZE: return "byte_sz";
-	case BPF_FIELD_EXISTS: return "field_exists";
-	case BPF_FIELD_SIGNED: return "signed";
-	case BPF_FIELD_LSHIFT_U64: return "lshift_u64";
-	case BPF_FIELD_RSHIFT_U64: return "rshift_u64";
-	case BPF_TYPE_ID_LOCAL: return "local_type_id";
-	case BPF_TYPE_ID_TARGET: return "target_type_id";
-	case BPF_TYPE_EXISTS: return "type_exists";
-	case BPF_TYPE_SIZE: return "type_size";
-	case BPF_ENUMVAL_EXISTS: return "enumval_exists";
-	case BPF_ENUMVAL_VALUE: return "enumval_value";
+	case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off";
+	case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz";
+	case BPF_CORE_FIELD_EXISTS: return "field_exists";
+	case BPF_CORE_FIELD_SIGNED: return "signed";
+	case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64";
+	case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64";
+	case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id";
+	case BPF_CORE_TYPE_ID_TARGET: return "target_type_id";
+	case BPF_CORE_TYPE_EXISTS: return "type_exists";
+	case BPF_CORE_TYPE_SIZE: return "type_size";
+	case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists";
+	case BPF_CORE_ENUMVAL_VALUE: return "enumval_value";
 	default: return "unknown";
 	}
 }
@@ -132,12 +132,12 @@ static const char *core_relo_kind_str(enum bpf_core_relo_kind kind)
 static bool core_relo_is_field_based(enum bpf_core_relo_kind kind)
 {
 	switch (kind) {
-	case BPF_FIELD_BYTE_OFFSET:
-	case BPF_FIELD_BYTE_SIZE:
-	case BPF_FIELD_EXISTS:
-	case BPF_FIELD_SIGNED:
-	case BPF_FIELD_LSHIFT_U64:
-	case BPF_FIELD_RSHIFT_U64:
+	case BPF_CORE_FIELD_BYTE_OFFSET:
+	case BPF_CORE_FIELD_BYTE_SIZE:
+	case BPF_CORE_FIELD_EXISTS:
+	case BPF_CORE_FIELD_SIGNED:
+	case BPF_CORE_FIELD_LSHIFT_U64:
+	case BPF_CORE_FIELD_RSHIFT_U64:
 		return true;
 	default:
 		return false;
@@ -147,10 +147,10 @@ static bool core_relo_is_field_based(enum bpf_core_relo_kind kind)
 static bool core_relo_is_type_based(enum bpf_core_relo_kind kind)
 {
 	switch (kind) {
-	case BPF_TYPE_ID_LOCAL:
-	case BPF_TYPE_ID_TARGET:
-	case BPF_TYPE_EXISTS:
-	case BPF_TYPE_SIZE:
+	case BPF_CORE_TYPE_ID_LOCAL:
+	case BPF_CORE_TYPE_ID_TARGET:
+	case BPF_CORE_TYPE_EXISTS:
+	case BPF_CORE_TYPE_SIZE:
 		return true;
 	default:
 		return false;
@@ -160,8 +160,8 @@ static bool core_relo_is_type_based(enum bpf_core_relo_kind kind)
 static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind)
 {
 	switch (kind) {
-	case BPF_ENUMVAL_EXISTS:
-	case BPF_ENUMVAL_VALUE:
+	case BPF_CORE_ENUMVAL_EXISTS:
+	case BPF_CORE_ENUMVAL_VALUE:
 		return true;
 	default:
 		return false;
@@ -624,7 +624,7 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 
 	*field_sz = 0;
 
-	if (relo->kind == BPF_FIELD_EXISTS) {
+	if (relo->kind == BPF_CORE_FIELD_EXISTS) {
 		*val = spec ? 1 : 0;
 		return 0;
 	}
@@ -637,7 +637,7 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 
 	/* a[n] accessor needs special handling */
 	if (!acc->name) {
-		if (relo->kind == BPF_FIELD_BYTE_OFFSET) {
+		if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) {
 			*val = spec->bit_offset / 8;
 			/* remember field size for load/store mem size */
 			sz = btf__resolve_size(spec->btf, acc->type_id);
@@ -645,7 +645,7 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 				return -EINVAL;
 			*field_sz = sz;
 			*type_id = acc->type_id;
-		} else if (relo->kind == BPF_FIELD_BYTE_SIZE) {
+		} else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) {
 			sz = btf__resolve_size(spec->btf, acc->type_id);
 			if (sz < 0)
 				return -EINVAL;
@@ -697,36 +697,36 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 		*validate = !bitfield;
 
 	switch (relo->kind) {
-	case BPF_FIELD_BYTE_OFFSET:
+	case BPF_CORE_FIELD_BYTE_OFFSET:
 		*val = byte_off;
 		if (!bitfield) {
 			*field_sz = byte_sz;
 			*type_id = field_type_id;
 		}
 		break;
-	case BPF_FIELD_BYTE_SIZE:
+	case BPF_CORE_FIELD_BYTE_SIZE:
 		*val = byte_sz;
 		break;
-	case BPF_FIELD_SIGNED:
+	case BPF_CORE_FIELD_SIGNED:
 		/* enums will be assumed unsigned */
 		*val = btf_is_enum(mt) ||
 		       (btf_int_encoding(mt) & BTF_INT_SIGNED);
 		if (validate)
 			*validate = true; /* signedness is never ambiguous */
 		break;
-	case BPF_FIELD_LSHIFT_U64:
+	case BPF_CORE_FIELD_LSHIFT_U64:
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 		*val = 64 - (bit_off + bit_sz - byte_off  * 8);
 #else
 		*val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8);
 #endif
 		break;
-	case BPF_FIELD_RSHIFT_U64:
+	case BPF_CORE_FIELD_RSHIFT_U64:
 		*val = 64 - bit_sz;
 		if (validate)
 			*validate = true; /* right shift is never ambiguous */
 		break;
-	case BPF_FIELD_EXISTS:
+	case BPF_CORE_FIELD_EXISTS:
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -747,20 +747,20 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo,
 	}
 
 	switch (relo->kind) {
-	case BPF_TYPE_ID_TARGET:
+	case BPF_CORE_TYPE_ID_TARGET:
 		*val = spec->root_type_id;
 		break;
-	case BPF_TYPE_EXISTS:
+	case BPF_CORE_TYPE_EXISTS:
 		*val = 1;
 		break;
-	case BPF_TYPE_SIZE:
+	case BPF_CORE_TYPE_SIZE:
 		sz = btf__resolve_size(spec->btf, spec->root_type_id);
 		if (sz < 0)
 			return -EINVAL;
 		*val = sz;
 		break;
-	case BPF_TYPE_ID_LOCAL:
-	/* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */
+	case BPF_CORE_TYPE_ID_LOCAL:
+	/* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -776,10 +776,10 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo,
 	const struct btf_enum *e;
 
 	switch (relo->kind) {
-	case BPF_ENUMVAL_EXISTS:
+	case BPF_CORE_ENUMVAL_EXISTS:
 		*val = spec ? 1 : 0;
 		break;
-	case BPF_ENUMVAL_VALUE:
+	case BPF_CORE_ENUMVAL_VALUE:
 		if (!spec)
 			return -EUCLEAN; /* request instruction poisoning */
 		t = btf_type_by_id(spec->btf, spec->spec[0].type_id);
@@ -1236,7 +1236,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 	libbpf_print(LIBBPF_DEBUG, "\n");
 
 	/* TYPE_ID_LOCAL relo is special and doesn't need candidate search */
-	if (relo->kind == BPF_TYPE_ID_LOCAL) {
+	if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) {
 		targ_res.validate = true;
 		targ_res.poison = false;
 		targ_res.orig_val = local_spec.root_type_id;
@@ -1302,7 +1302,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 	}
 
 	/*
-	 * For BPF_FIELD_EXISTS relo or when used BPF program has field
+	 * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field
 	 * existence checks or kernel version/config checks, it's expected
 	 * that we might not find any candidates. In this case, if field
 	 * wasn't found in any candidate, the list of candidates shouldn't
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index 3b9f8f18346c..3d0b86e7f439 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -4,23 +4,7 @@
 #ifndef __RELO_CORE_H
 #define __RELO_CORE_H
 
-/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value
- * has to be adjusted by relocations.
- */
-enum bpf_core_relo_kind {
-	BPF_FIELD_BYTE_OFFSET = 0,	/* field byte offset */
-	BPF_FIELD_BYTE_SIZE = 1,	/* field size in bytes */
-	BPF_FIELD_EXISTS = 2,		/* field existence in target kernel */
-	BPF_FIELD_SIGNED = 3,		/* field signedness (0 - unsigned, 1 - signed) */
-	BPF_FIELD_LSHIFT_U64 = 4,	/* bitfield-specific left bitshift */
-	BPF_FIELD_RSHIFT_U64 = 5,	/* bitfield-specific right bitshift */
-	BPF_TYPE_ID_LOCAL = 6,		/* type ID in local BPF object */
-	BPF_TYPE_ID_TARGET = 7,		/* type ID in target kernel */
-	BPF_TYPE_EXISTS = 8,		/* type existence in target kernel */
-	BPF_TYPE_SIZE = 9,		/* type size in bytes */
-	BPF_ENUMVAL_EXISTS = 10,	/* enum value existence in target kernel */
-	BPF_ENUMVAL_VALUE = 11,		/* enum value integer value */
-};
+#include <linux/bpf.h>
 
 /* The minimum bpf_core_relo checked by the loader
  *
-- 
cgit v1.2.3


From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:28 -0800
Subject: bpf: Pass a set of bpf_core_relo-s to prog_load command.

struct bpf_core_relo is generated by llvm and processed by libbpf.
It's a de-facto uapi.
With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure.
Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command
and let the kernel perform CO-RE relocations.

Note the struct bpf_line_info and struct bpf_func_info have the same
layout when passed from LLVM to libbpf and from libbpf to the kernel
except "insn_off" fields means "byte offset" when LLVM generates it.
Then libbpf converts it to "insn index" to pass to the kernel.
The struct bpf_core_relo's "insn_off" field is always "byte offset".

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h            |  8 +++++
 include/uapi/linux/bpf.h       | 59 +++++++++++++++++++++++++++++++-
 kernel/bpf/btf.c               |  6 ++++
 kernel/bpf/syscall.c           |  2 +-
 kernel/bpf/verifier.c          | 76 ++++++++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 59 +++++++++++++++++++++++++++++++-
 tools/lib/bpf/relo_core.h      | 53 -----------------------------
 7 files changed, 207 insertions(+), 56 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cad0829710be..8bbf08fbab66 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
 bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 			 const struct bpf_insn *insn);
+struct bpf_core_ctx {
+	struct bpf_verifier_log *log;
+	const struct btf *btf;
+};
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+		   int relo_idx, void *insn);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9e66b1880020..c26871263f1f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1342,8 +1342,10 @@ union bpf_attr {
 			/* or valid module BTF object fd or 0 to attach to vmlinux */
 			__u32		attach_btf_obj_fd;
 		};
-		__u32		:32;		/* pad */
+		__u32		core_relo_cnt;	/* number of bpf_core_relo */
 		__aligned_u64	fd_array;	/* array of FDs */
+		__aligned_u64	core_relos;
+		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind {
 	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
 };
 
+/*
+ * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf
+ * and from libbpf to the kernel.
+ *
+ * CO-RE relocation captures the following data:
+ * - insn_off - instruction offset (in bytes) within a BPF program that needs
+ *   its insn->imm field to be relocated with actual field info;
+ * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
+ *   type or field;
+ * - access_str_off - offset into corresponding .BTF string section. String
+ *   interpretation depends on specific relocation kind:
+ *     - for field-based relocations, string encodes an accessed field using
+ *       a sequence of field and array indices, separated by colon (:). It's
+ *       conceptually very close to LLVM's getelementptr ([0]) instruction's
+ *       arguments for identifying offset to a field.
+ *     - for type-based relocations, strings is expected to be just "0";
+ *     - for enum value-based relocations, string contains an index of enum
+ *       value within its enum type;
+ * - kind - one of enum bpf_core_relo_kind;
+ *
+ * Example:
+ *   struct sample {
+ *       int a;
+ *       struct {
+ *           int b[10];
+ *       };
+ *   };
+ *
+ *   struct sample *s = ...;
+ *   int *x = &s->a;     // encoded as "0:0" (a is field #0)
+ *   int *y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
+ *                       // b is field #0 inside anon struct, accessing elem #5)
+ *   int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
+ *
+ * type_id for all relocs in this example will capture BTF type id of
+ * `struct sample`.
+ *
+ * Such relocation is emitted when using __builtin_preserve_access_index()
+ * Clang built-in, passing expression that captures field address, e.g.:
+ *
+ * bpf_probe_read(&dst, sizeof(dst),
+ *		  __builtin_preserve_access_index(&src->a.b.c));
+ *
+ * In this case Clang will emit field relocation recording necessary data to
+ * be able to find offset of embedded `a.b.c` field within `src` struct.
+ *
+ * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
+ */
+struct bpf_core_relo {
+	__u32 insn_off;
+	__u32 type_id;
+	__u32 access_str_off;
+	enum bpf_core_relo_kind kind;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c79595aad55b..0d070461e2b8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6439,3 +6439,9 @@ size_t bpf_core_essential_name_len(const char *name)
 	}
 	return n;
 }
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+		   int relo_idx, void *insn)
+{
+	return -EOPNOTSUPP;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 47089d1d67a4..b3ada4085f85 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2184,7 +2184,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD fd_array
+#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c9c0d9a04a0..6522ffdea487 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10273,6 +10273,78 @@ err_free:
 	return err;
 }
 
+#define MIN_CORE_RELO_SIZE	sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE	MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+			   const union bpf_attr *attr,
+			   bpfptr_t uattr)
+{
+	u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+	struct bpf_core_relo core_relo = {};
+	struct bpf_prog *prog = env->prog;
+	const struct btf *btf = prog->aux->btf;
+	struct bpf_core_ctx ctx = {
+		.log = &env->log,
+		.btf = btf,
+	};
+	bpfptr_t u_core_relo;
+	int err;
+
+	nr_core_relo = attr->core_relo_cnt;
+	if (!nr_core_relo)
+		return 0;
+	if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+		return -EINVAL;
+
+	rec_size = attr->core_relo_rec_size;
+	if (rec_size < MIN_CORE_RELO_SIZE ||
+	    rec_size > MAX_CORE_RELO_SIZE ||
+	    rec_size % sizeof(u32))
+		return -EINVAL;
+
+	u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+	expected_size = sizeof(struct bpf_core_relo);
+	ncopy = min_t(u32, expected_size, rec_size);
+
+	/* Unlike func_info and line_info, copy and apply each CO-RE
+	 * relocation record one at a time.
+	 */
+	for (i = 0; i < nr_core_relo; i++) {
+		/* future proofing when sizeof(bpf_core_relo) changes */
+		err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+		if (err) {
+			if (err == -E2BIG) {
+				verbose(env, "nonzero tailing record in core_relo");
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, core_relo_rec_size),
+							  &expected_size, sizeof(expected_size)))
+					err = -EFAULT;
+			}
+			break;
+		}
+
+		if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+			verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+				i, core_relo.insn_off, prog->len);
+			err = -EINVAL;
+			break;
+		}
+
+		err = bpf_core_apply(&ctx, &core_relo, i,
+				     &prog->insnsi[core_relo.insn_off / 8]);
+		if (err)
+			break;
+		bpfptr_add(&u_core_relo, rec_size);
+	}
+	return err;
+}
+
 static int check_btf_info(struct bpf_verifier_env *env,
 			  const union bpf_attr *attr,
 			  bpfptr_t uattr)
@@ -10303,6 +10375,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
 	if (err)
 		return err;
 
+	err = check_core_relo(env, attr, uattr);
+	if (err)
+		return err;
+
 	return 0;
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9e66b1880020..c26871263f1f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1342,8 +1342,10 @@ union bpf_attr {
 			/* or valid module BTF object fd or 0 to attach to vmlinux */
 			__u32		attach_btf_obj_fd;
 		};
-		__u32		:32;		/* pad */
+		__u32		core_relo_cnt;	/* number of bpf_core_relo */
 		__aligned_u64	fd_array;	/* array of FDs */
+		__aligned_u64	core_relos;
+		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind {
 	BPF_CORE_ENUMVAL_VALUE = 11,         /* enum value integer value */
 };
 
+/*
+ * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf
+ * and from libbpf to the kernel.
+ *
+ * CO-RE relocation captures the following data:
+ * - insn_off - instruction offset (in bytes) within a BPF program that needs
+ *   its insn->imm field to be relocated with actual field info;
+ * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
+ *   type or field;
+ * - access_str_off - offset into corresponding .BTF string section. String
+ *   interpretation depends on specific relocation kind:
+ *     - for field-based relocations, string encodes an accessed field using
+ *       a sequence of field and array indices, separated by colon (:). It's
+ *       conceptually very close to LLVM's getelementptr ([0]) instruction's
+ *       arguments for identifying offset to a field.
+ *     - for type-based relocations, strings is expected to be just "0";
+ *     - for enum value-based relocations, string contains an index of enum
+ *       value within its enum type;
+ * - kind - one of enum bpf_core_relo_kind;
+ *
+ * Example:
+ *   struct sample {
+ *       int a;
+ *       struct {
+ *           int b[10];
+ *       };
+ *   };
+ *
+ *   struct sample *s = ...;
+ *   int *x = &s->a;     // encoded as "0:0" (a is field #0)
+ *   int *y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
+ *                       // b is field #0 inside anon struct, accessing elem #5)
+ *   int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
+ *
+ * type_id for all relocs in this example will capture BTF type id of
+ * `struct sample`.
+ *
+ * Such relocation is emitted when using __builtin_preserve_access_index()
+ * Clang built-in, passing expression that captures field address, e.g.:
+ *
+ * bpf_probe_read(&dst, sizeof(dst),
+ *		  __builtin_preserve_access_index(&src->a.b.c));
+ *
+ * In this case Clang will emit field relocation recording necessary data to
+ * be able to find offset of embedded `a.b.c` field within `src` struct.
+ *
+ * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
+ */
+struct bpf_core_relo {
+	__u32 insn_off;
+	__u32 type_id;
+	__u32 access_str_off;
+	enum bpf_core_relo_kind kind;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index 3d0b86e7f439..f410691cc4e5 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -6,59 +6,6 @@
 
 #include <linux/bpf.h>
 
-/* The minimum bpf_core_relo checked by the loader
- *
- * CO-RE relocation captures the following data:
- * - insn_off - instruction offset (in bytes) within a BPF program that needs
- *   its insn->imm field to be relocated with actual field info;
- * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
- *   type or field;
- * - access_str_off - offset into corresponding .BTF string section. String
- *   interpretation depends on specific relocation kind:
- *     - for field-based relocations, string encodes an accessed field using
- *     a sequence of field and array indices, separated by colon (:). It's
- *     conceptually very close to LLVM's getelementptr ([0]) instruction's
- *     arguments for identifying offset to a field.
- *     - for type-based relocations, strings is expected to be just "0";
- *     - for enum value-based relocations, string contains an index of enum
- *     value within its enum type;
- *
- * Example to provide a better feel.
- *
- *   struct sample {
- *       int a;
- *       struct {
- *           int b[10];
- *       };
- *   };
- *
- *   struct sample *s = ...;
- *   int x = &s->a;     // encoded as "0:0" (a is field #0)
- *   int y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
- *                      // b is field #0 inside anon struct, accessing elem #5)
- *   int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
- *
- * type_id for all relocs in this example  will capture BTF type id of
- * `struct sample`.
- *
- * Such relocation is emitted when using __builtin_preserve_access_index()
- * Clang built-in, passing expression that captures field address, e.g.:
- *
- * bpf_probe_read(&dst, sizeof(dst),
- *		  __builtin_preserve_access_index(&src->a.b.c));
- *
- * In this case Clang will emit field relocation recording necessary data to
- * be able to find offset of embedded `a.b.c` field within `src` struct.
- *
- *   [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
- */
-struct bpf_core_relo {
-	__u32   insn_off;
-	__u32   type_id;
-	__u32   access_str_off;
-	enum bpf_core_relo_kind kind;
-};
-
 struct bpf_core_cand {
 	const struct btf *btf;
 	const struct btf_type *t;
-- 
cgit v1.2.3


From 03d5b99138dd8c7bfb838396acb180bd515ebf06 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 10:10:30 -0800
Subject: libbpf: Cleanup struct bpf_core_cand.

Remove two redundant fields from struct bpf_core_cand.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-8-alexei.starovoitov@gmail.com
---
 tools/lib/bpf/libbpf.c    | 30 +++++++++++++++++-------------
 tools/lib/bpf/relo_core.h |  2 --
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9eaf2d9820e6..96792d6e6fc1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5179,15 +5179,18 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
 			      struct bpf_core_cand_list *cands)
 {
 	struct bpf_core_cand *new_cands, *cand;
-	const struct btf_type *t;
-	const char *targ_name;
+	const struct btf_type *t, *local_t;
+	const char *targ_name, *local_name;
 	size_t targ_essent_len;
 	int n, i;
 
+	local_t = btf__type_by_id(local_cand->btf, local_cand->id);
+	local_name = btf__str_by_offset(local_cand->btf, local_t->name_off);
+
 	n = btf__type_cnt(targ_btf);
 	for (i = targ_start_id; i < n; i++) {
 		t = btf__type_by_id(targ_btf, i);
-		if (btf_kind(t) != btf_kind(local_cand->t))
+		if (btf_kind(t) != btf_kind(local_t))
 			continue;
 
 		targ_name = btf__name_by_offset(targ_btf, t->name_off);
@@ -5198,12 +5201,12 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
 		if (targ_essent_len != local_essent_len)
 			continue;
 
-		if (strncmp(local_cand->name, targ_name, local_essent_len) != 0)
+		if (strncmp(local_name, targ_name, local_essent_len) != 0)
 			continue;
 
 		pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n",
-			 local_cand->id, btf_kind_str(local_cand->t),
-			 local_cand->name, i, btf_kind_str(t), targ_name,
+			 local_cand->id, btf_kind_str(local_t),
+			 local_name, i, btf_kind_str(t), targ_name,
 			 targ_btf_name);
 		new_cands = libbpf_reallocarray(cands->cands, cands->len + 1,
 					      sizeof(*cands->cands));
@@ -5212,8 +5215,6 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand,
 
 		cand = &new_cands[cands->len];
 		cand->btf = targ_btf;
-		cand->t = t;
-		cand->name = targ_name;
 		cand->id = i;
 
 		cands->cands = new_cands;
@@ -5320,18 +5321,21 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l
 	struct bpf_core_cand local_cand = {};
 	struct bpf_core_cand_list *cands;
 	const struct btf *main_btf;
+	const struct btf_type *local_t;
+	const char *local_name;
 	size_t local_essent_len;
 	int err, i;
 
 	local_cand.btf = local_btf;
-	local_cand.t = btf__type_by_id(local_btf, local_type_id);
-	if (!local_cand.t)
+	local_cand.id = local_type_id;
+	local_t = btf__type_by_id(local_btf, local_type_id);
+	if (!local_t)
 		return ERR_PTR(-EINVAL);
 
-	local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off);
-	if (str_is_empty(local_cand.name))
+	local_name = btf__name_by_offset(local_btf, local_t->name_off);
+	if (str_is_empty(local_name))
 		return ERR_PTR(-EINVAL);
-	local_essent_len = bpf_core_essential_name_len(local_cand.name);
+	local_essent_len = bpf_core_essential_name_len(local_name);
 
 	cands = calloc(1, sizeof(*cands));
 	if (!cands)
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index f410691cc4e5..4f864b8e33b7 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -8,8 +8,6 @@
 
 struct bpf_core_cand {
 	const struct btf *btf;
-	const struct btf_type *t;
-	const char *name;
 	__u32 id;
 };
 
-- 
cgit v1.2.3


From d0e928876e30b18411b80fd2445424bc00e95745 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:32 -0800
Subject: libbpf: Use CO-RE in the kernel in light skeleton.

Without lskel the CO-RE relocations are processed by libbpf before any other
work is done. Instead, when lskel is needed, remember relocation as RELO_CORE
kind. Then when loader prog is generated for a given bpf program pass CO-RE
relos of that program to gen loader via bpf_gen__record_relo_core(). The gen
loader will remember them as-is and pass it later as-is into the kernel.

The normal libbpf flow is to process CO-RE early before call relos happen. In
case of gen_loader the core relos have to be added to other relos to be copied
together when bpf static function is appended in different places to other main
bpf progs. During the copy the append_subprog_relos() will adjust insn_idx for
normal relos and for RELO_CORE kind too. When that is done each struct
reloc_desc has good relos for specific main prog.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-10-alexei.starovoitov@gmail.com
---
 tools/lib/bpf/bpf_gen_internal.h |   3 ++
 tools/lib/bpf/gen_loader.c       |  41 +++++++++++++--
 tools/lib/bpf/libbpf.c           | 109 ++++++++++++++++++++++++++++-----------
 3 files changed, 120 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index ae7704deba30..9d57fa84664b 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -39,6 +39,8 @@ struct bpf_gen {
 	int error;
 	struct ksym_relo_desc *relos;
 	int relo_cnt;
+	struct bpf_core_relo *core_relos;
+	int core_relo_cnt;
 	char attach_target[128];
 	int attach_kind;
 	struct ksym_desc *ksyms;
@@ -64,5 +66,6 @@ void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx);
 void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type);
 void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak,
 			    bool is_typeless, int kind, int insn_idx);
+void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo);
 
 #endif
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 6f3790369463..87d385e892ab 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -829,6 +829,22 @@ clear_src_reg:
 	emit_ksym_relo_log(gen, relo, kdesc->ref);
 }
 
+void bpf_gen__record_relo_core(struct bpf_gen *gen,
+			       const struct bpf_core_relo *core_relo)
+{
+	struct bpf_core_relo *relos;
+
+	relos = libbpf_reallocarray(gen->core_relos, gen->core_relo_cnt + 1, sizeof(*relos));
+	if (!relos) {
+		gen->error = -ENOMEM;
+		return;
+	}
+	gen->core_relos = relos;
+	relos += gen->core_relo_cnt;
+	memcpy(relos, core_relo, sizeof(*relos));
+	gen->core_relo_cnt++;
+}
+
 static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insns)
 {
 	int insn;
@@ -861,6 +877,15 @@ static void emit_relos(struct bpf_gen *gen, int insns)
 		emit_relo(gen, gen->relos + i, insns);
 }
 
+static void cleanup_core_relo(struct bpf_gen *gen)
+{
+	if (!gen->core_relo_cnt)
+		return;
+	free(gen->core_relos);
+	gen->core_relo_cnt = 0;
+	gen->core_relos = NULL;
+}
+
 static void cleanup_relos(struct bpf_gen *gen, int insns)
 {
 	int i, insn;
@@ -888,6 +913,7 @@ static void cleanup_relos(struct bpf_gen *gen, int insns)
 		gen->relo_cnt = 0;
 		gen->relos = NULL;
 	}
+	cleanup_core_relo(gen);
 }
 
 void bpf_gen__prog_load(struct bpf_gen *gen,
@@ -895,12 +921,13 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 			const char *license, struct bpf_insn *insns, size_t insn_cnt,
 			struct bpf_prog_load_opts *load_attr, int prog_idx)
 {
-	int attr_size = offsetofend(union bpf_attr, fd_array);
-	int prog_load_attr, license_off, insns_off, func_info, line_info;
+	int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos;
+	int attr_size = offsetofend(union bpf_attr, core_relo_rec_size);
 	union bpf_attr attr;
 
 	memset(&attr, 0, attr_size);
-	pr_debug("gen: prog_load: type %d insns_cnt %zd\n", prog_type, insn_cnt);
+	pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n",
+		 prog_type, insn_cnt, prog_idx);
 	/* add license string to blob of bytes */
 	license_off = add_data(gen, license, strlen(license) + 1);
 	/* add insns to blob of bytes */
@@ -924,6 +951,11 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 	line_info = add_data(gen, load_attr->line_info,
 			     attr.line_info_cnt * attr.line_info_rec_size);
 
+	attr.core_relo_rec_size = sizeof(struct bpf_core_relo);
+	attr.core_relo_cnt = gen->core_relo_cnt;
+	core_relos = add_data(gen, gen->core_relos,
+			     attr.core_relo_cnt * attr.core_relo_rec_size);
+
 	memcpy(attr.prog_name, prog_name,
 	       min((unsigned)strlen(prog_name), BPF_OBJ_NAME_LEN - 1));
 	prog_load_attr = add_data(gen, &attr, attr_size);
@@ -940,6 +972,9 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 	/* populate union bpf_attr with a pointer to line_info */
 	emit_rel_store(gen, attr_field(prog_load_attr, line_info), line_info);
 
+	/* populate union bpf_attr with a pointer to core_relos */
+	emit_rel_store(gen, attr_field(prog_load_attr, core_relos), core_relos);
+
 	/* populate union bpf_attr fd_array with a pointer to data where map_fds are saved */
 	emit_rel_store(gen, attr_field(prog_load_attr, fd_array), gen->fd_array);
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 96792d6e6fc1..831c12e00813 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -230,13 +230,19 @@ enum reloc_type {
 	RELO_EXTERN_VAR,
 	RELO_EXTERN_FUNC,
 	RELO_SUBPROG_ADDR,
+	RELO_CORE,
 };
 
 struct reloc_desc {
 	enum reloc_type type;
 	int insn_idx;
-	int map_idx;
-	int sym_off;
+	union {
+		const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */
+		struct {
+			int map_idx;
+			int sym_off;
+		};
+	};
 };
 
 struct bpf_sec_def;
@@ -5485,6 +5491,24 @@ static void *u32_as_hash_key(__u32 x)
 	return (void *)(uintptr_t)x;
 }
 
+static int record_relo_core(struct bpf_program *prog,
+			    const struct bpf_core_relo *core_relo, int insn_idx)
+{
+	struct reloc_desc *relos, *relo;
+
+	relos = libbpf_reallocarray(prog->reloc_desc,
+				    prog->nr_reloc + 1, sizeof(*relos));
+	if (!relos)
+		return -ENOMEM;
+	relo = &relos[prog->nr_reloc];
+	relo->type = RELO_CORE;
+	relo->insn_idx = insn_idx;
+	relo->core_relo = core_relo;
+	prog->reloc_desc = relos;
+	prog->nr_reloc++;
+	return 0;
+}
+
 static int bpf_core_apply_relo(struct bpf_program *prog,
 			       const struct bpf_core_relo *relo,
 			       int relo_idx,
@@ -5521,10 +5545,12 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
 		return -EINVAL;
 
 	if (prog->obj->gen_loader) {
-		pr_warn("// TODO core_relo: prog %td insn[%d] %s kind %d\n",
+		const char *spec_str = btf__name_by_offset(local_btf, relo->access_str_off);
+
+		pr_debug("record_relo_core: prog %td insn[%d] %s %s %s final insn_idx %d\n",
 			prog - prog->obj->programs, relo->insn_off / 8,
-			local_name, relo->kind);
-		return -ENOTSUP;
+			btf_kind_str(local_type), local_name, spec_str, insn_idx);
+		return record_relo_core(prog, relo, insn_idx);
 	}
 
 	if (relo->kind != BPF_CORE_TYPE_ID_LOCAL &&
@@ -5729,6 +5755,9 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 		case RELO_CALL:
 			/* handled already */
 			break;
+		case RELO_CORE:
+			/* will be handled by bpf_program_record_relos() */
+			break;
 		default:
 			pr_warn("prog '%s': relo #%d: bad relo type %d\n",
 				prog->name, i, relo->type);
@@ -6169,6 +6198,35 @@ bpf_object__free_relocs(struct bpf_object *obj)
 	}
 }
 
+static int cmp_relocs(const void *_a, const void *_b)
+{
+	const struct reloc_desc *a = _a;
+	const struct reloc_desc *b = _b;
+
+	if (a->insn_idx != b->insn_idx)
+		return a->insn_idx < b->insn_idx ? -1 : 1;
+
+	/* no two relocations should have the same insn_idx, but ... */
+	if (a->type != b->type)
+		return a->type < b->type ? -1 : 1;
+
+	return 0;
+}
+
+static void bpf_object__sort_relos(struct bpf_object *obj)
+{
+	int i;
+
+	for (i = 0; i < obj->nr_programs; i++) {
+		struct bpf_program *p = &obj->programs[i];
+
+		if (!p->nr_reloc)
+			continue;
+
+		qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs);
+	}
+}
+
 static int
 bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
 {
@@ -6183,6 +6241,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path)
 				err);
 			return err;
 		}
+		if (obj->gen_loader)
+			bpf_object__sort_relos(obj);
 	}
 
 	/* Before relocating calls pre-process relocations and mark
@@ -6387,21 +6447,6 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj,
 	return 0;
 }
 
-static int cmp_relocs(const void *_a, const void *_b)
-{
-	const struct reloc_desc *a = _a;
-	const struct reloc_desc *b = _b;
-
-	if (a->insn_idx != b->insn_idx)
-		return a->insn_idx < b->insn_idx ? -1 : 1;
-
-	/* no two relocations should have the same insn_idx, but ... */
-	if (a->type != b->type)
-		return a->type < b->type ? -1 : 1;
-
-	return 0;
-}
-
 static int bpf_object__collect_relos(struct bpf_object *obj)
 {
 	int i, err;
@@ -6434,14 +6479,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
 			return err;
 	}
 
-	for (i = 0; i < obj->nr_programs; i++) {
-		struct bpf_program *p = &obj->programs[i];
-
-		if (!p->nr_reloc)
-			continue;
-
-		qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs);
-	}
+	bpf_object__sort_relos(obj);
 	return 0;
 }
 
@@ -6683,7 +6721,7 @@ out:
 	return ret;
 }
 
-static int bpf_program__record_externs(struct bpf_program *prog)
+static int bpf_program_record_relos(struct bpf_program *prog)
 {
 	struct bpf_object *obj = prog->obj;
 	int i;
@@ -6705,6 +6743,17 @@ static int bpf_program__record_externs(struct bpf_program *prog)
 					       ext->is_weak, false, BTF_KIND_FUNC,
 					       relo->insn_idx);
 			break;
+		case RELO_CORE: {
+			struct bpf_core_relo cr = {
+				.insn_off = relo->insn_idx * 8,
+				.type_id = relo->core_relo->type_id,
+				.access_str_off = relo->core_relo->access_str_off,
+				.kind = relo->core_relo->kind,
+			};
+
+			bpf_gen__record_relo_core(obj->gen_loader, &cr);
+			break;
+		}
 		default:
 			continue;
 		}
@@ -6744,7 +6793,7 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 				prog->name, prog->instances.nr);
 		}
 		if (obj->gen_loader)
-			bpf_program__record_externs(prog);
+			bpf_program_record_relos(prog);
 		err = bpf_object_load_prog_instance(obj, prog,
 						    prog->insns, prog->insns_cnt,
 						    license, kern_ver, &fd);
-- 
cgit v1.2.3


From be05c94476f3cf4fdc29feab4ed1053187323296 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:33 -0800
Subject: libbpf: Support init of inner maps in light skeleton.

Add ability to initialize inner maps in light skeleton.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-11-alexei.starovoitov@gmail.com
---
 tools/lib/bpf/bpf_gen_internal.h |  1 +
 tools/lib/bpf/gen_loader.c       | 27 +++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.c           |  6 +++---
 3 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index 9d57fa84664b..9dd5f76692ef 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -67,5 +67,6 @@ void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum b
 void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak,
 			    bool is_typeless, int kind, int insn_idx);
 void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo);
+void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx);
 
 #endif
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 87d385e892ab..ed0e949790da 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -1053,6 +1053,33 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue,
 	emit_check_err(gen);
 }
 
+void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slot,
+				 int inner_map_idx)
+{
+	int attr_size = offsetofend(union bpf_attr, flags);
+	int map_update_attr, key;
+	union bpf_attr attr;
+
+	memset(&attr, 0, attr_size);
+	pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n",
+		 outer_map_idx, slot, inner_map_idx);
+
+	key = add_data(gen, &slot, sizeof(slot));
+
+	map_update_attr = add_data(gen, &attr, attr_size);
+	move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4,
+		       blob_fd_array_off(gen, outer_map_idx));
+	emit_rel_store(gen, attr_field(map_update_attr, key), key);
+	emit_rel_store(gen, attr_field(map_update_attr, value),
+		       blob_fd_array_off(gen, inner_map_idx));
+
+	/* emit MAP_UPDATE_ELEM command */
+	emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size);
+	debug_ret(gen, "populate_outer_map outer %d key %d inner %d",
+		  outer_map_idx, slot, inner_map_idx);
+	emit_check_err(gen);
+}
+
 void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx)
 {
 	int attr_size = offsetofend(union bpf_attr, map_fd);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 831c12e00813..1341ce539662 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4971,9 +4971,9 @@ static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map)
 		fd = bpf_map__fd(targ_map);
 
 		if (obj->gen_loader) {
-			pr_warn("// TODO map_update_elem: idx %td key %d value==map_idx %td\n",
-				map - obj->maps, i, targ_map - obj->maps);
-			return -ENOTSUP;
+			bpf_gen__populate_outer_map(obj->gen_loader,
+						    map - obj->maps, i,
+						    targ_map - obj->maps);
 		} else {
 			err = bpf_map_update_elem(map->fd, &i, &fd, 0);
 		}
-- 
cgit v1.2.3


From 19250f5fc0c283892a61f3abf9d65e6325f63897 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:34 -0800
Subject: libbpf: Clean gen_loader's attach kind.

The gen_loader has to clear attach_kind otherwise the programs
without attach_btf_id will fail load if they follow programs
with attach_btf_id.

Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-12-alexei.starovoitov@gmail.com
---
 tools/lib/bpf/gen_loader.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index ed0e949790da..21dfb930f1d2 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -1005,9 +1005,11 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 	debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt);
 	/* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */
 	cleanup_relos(gen, insns_off);
-	if (gen->attach_kind)
+	if (gen->attach_kind) {
 		emit_sys_close_blob(gen,
 				    attr_field(prog_load_attr, attach_btf_obj_fd));
+		gen->attach_kind = 0;
+	}
 	emit_check_err(gen);
 	/* remember prog_fd in the stack, if successful */
 	emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7,
-- 
cgit v1.2.3


From bc5f75da977b2a4d9aa6827081e6c2ddd3347328 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:35 -0800
Subject: selftests/bpf: Add lskel version of kfunc test.

Add light skeleton version of kfunc_call_test_subprog test.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-13-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/Makefile               |  2 +-
 .../testing/selftests/bpf/prog_tests/kfunc_call.c  | 24 ++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a6c0e92c86a1..6046f86841cd 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -327,7 +327,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
 	test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c
 # Generate both light skeleton and libbpf skeleton for these
-LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c
+LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
 SKEL_BLACKLIST += $$(LSKELS)
 
 test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index 5c9c0176991b..7d7445ccc141 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -4,6 +4,7 @@
 #include <network_helpers.h>
 #include "kfunc_call_test.lskel.h"
 #include "kfunc_call_test_subprog.skel.h"
+#include "kfunc_call_test_subprog.lskel.h"
 
 static void test_main(void)
 {
@@ -49,6 +50,26 @@ static void test_subprog(void)
 	kfunc_call_test_subprog__destroy(skel);
 }
 
+static void test_subprog_lskel(void)
+{
+	struct kfunc_call_test_subprog_lskel *skel;
+	int prog_fd, retval, err;
+
+	skel = kfunc_call_test_subprog_lskel__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		return;
+
+	prog_fd = skel->progs.kfunc_call_test1.prog_fd;
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test1)");
+	ASSERT_EQ(retval, 10, "test1-retval");
+	ASSERT_NEQ(skel->data->active_res, -1, "active_res");
+	ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res");
+
+	kfunc_call_test_subprog_lskel__destroy(skel);
+}
+
 void test_kfunc_call(void)
 {
 	if (test__start_subtest("main"))
@@ -56,4 +77,7 @@ void test_kfunc_call(void)
 
 	if (test__start_subtest("subprog"))
 		test_subprog();
+
+	if (test__start_subtest("subprog_lskel"))
+		test_subprog_lskel();
 }
-- 
cgit v1.2.3


From d82fa9b708d7d8a9c275d86c4388d24ecc63206c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:36 -0800
Subject: selftests/bpf: Improve inner_map test coverage.

Check that hash and array inner maps are properly initialized.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-14-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/progs/map_ptr_kern.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
index b1b711d9b214..b64df94ec476 100644
--- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c
+++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
@@ -334,9 +334,11 @@ static inline int check_lpm_trie(void)
 	return 1;
 }
 
+#define INNER_MAX_ENTRIES 1234
+
 struct inner_map {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
-	__uint(max_entries, 1);
+	__uint(max_entries, INNER_MAX_ENTRIES);
 	__type(key, __u32);
 	__type(value, __u32);
 } inner_map SEC(".maps");
@@ -348,7 +350,7 @@ struct {
 	__type(value, __u32);
 	__array(values, struct {
 		__uint(type, BPF_MAP_TYPE_ARRAY);
-		__uint(max_entries, 1);
+		__uint(max_entries, INNER_MAX_ENTRIES);
 		__type(key, __u32);
 		__type(value, __u32);
 	});
@@ -360,8 +362,13 @@ static inline int check_array_of_maps(void)
 {
 	struct bpf_array *array_of_maps = (struct bpf_array *)&m_array_of_maps;
 	struct bpf_map *map = (struct bpf_map *)&m_array_of_maps;
+	struct bpf_array *inner_map;
+	int key = 0;
 
 	VERIFY(check_default(&array_of_maps->map, map));
+	inner_map = bpf_map_lookup_elem(array_of_maps, &key);
+	VERIFY(inner_map != 0);
+	VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES);
 
 	return 1;
 }
@@ -382,8 +389,13 @@ static inline int check_hash_of_maps(void)
 {
 	struct bpf_htab *hash_of_maps = (struct bpf_htab *)&m_hash_of_maps;
 	struct bpf_map *map = (struct bpf_map *)&m_hash_of_maps;
+	struct bpf_htab *inner_map;
+	int key = 2;
 
 	VERIFY(check_default(&hash_of_maps->map, map));
+	inner_map = bpf_map_lookup_elem(hash_of_maps, &key);
+	VERIFY(inner_map != 0);
+	VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES);
 
 	return 1;
 }
-- 
cgit v1.2.3


From 650c9dbd101ba7d7180f4e77deb1c273f4ea5ca3 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:37 -0800
Subject: selftests/bpf: Convert map_ptr_kern test to use light skeleton.

To exercise CO-RE in the kernel further convert map_ptr_kern
test to light skeleton.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-15-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/Makefile             |  3 ++-
 tools/testing/selftests/bpf/prog_tests/map_ptr.c | 16 +++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 6046f86841cd..200ebcc73651 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -325,7 +325,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h
 
 LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
-	test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c
+	test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \
+	map_ptr_kern.c
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
 SKEL_BLACKLIST += $$(LSKELS)
diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c
index 4972f92205c7..273725504f11 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c
@@ -4,31 +4,29 @@
 #include <test_progs.h>
 #include <network_helpers.h>
 
-#include "map_ptr_kern.skel.h"
+#include "map_ptr_kern.lskel.h"
 
 void test_map_ptr(void)
 {
-	struct map_ptr_kern *skel;
+	struct map_ptr_kern_lskel *skel;
 	__u32 duration = 0, retval;
 	char buf[128];
 	int err;
 	int page_size = getpagesize();
 
-	skel = map_ptr_kern__open();
+	skel = map_ptr_kern_lskel__open();
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size);
-	if (!ASSERT_OK(err, "bpf_map__set_max_entries"))
-		goto cleanup;
+	skel->maps.m_ringbuf.max_entries = page_size;
 
-	err = map_ptr_kern__load(skel);
+	err = map_ptr_kern_lskel__load(skel);
 	if (!ASSERT_OK(err, "skel_load"))
 		goto cleanup;
 
 	skel->bss->page_size = page_size;
 
-	err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4,
+	err = bpf_prog_test_run(skel->progs.cg_skb.prog_fd, 1, &pkt_v4,
 				sizeof(pkt_v4), buf, NULL, &retval, NULL);
 
 	if (CHECK(err, "test_run", "err=%d errno=%d\n", err, errno))
@@ -39,5 +37,5 @@ void test_map_ptr(void)
 		goto cleanup;
 
 cleanup:
-	map_ptr_kern__destroy(skel);
+	map_ptr_kern_lskel__destroy(skel);
 }
-- 
cgit v1.2.3


From 26b367e3663931f2fee5f0786a1eff712e67b0bf Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:38 -0800
Subject: selftests/bpf: Additional test for CO-RE in the kernel.

Add a test where randmap() function is appended to three different bpf
programs. That action checks struct bpf_core_relo replication logic
and offset adjustment in gen loader part of libbpf.

Fourth bpf program has 360 CO-RE relocations from vmlinux, bpf_testmod,
and non-existing type. It tests candidate cache logic.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-16-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/Makefile               |   2 +-
 tools/testing/selftests/bpf/prog_tests/core_kern.c |  14 +++
 tools/testing/selftests/bpf/progs/core_kern.c      | 104 +++++++++++++++++++++
 3 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/core_kern.c
 create mode 100644 tools/testing/selftests/bpf/progs/core_kern.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 200ebcc73651..8981369b071b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -326,7 +326,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 
 LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
 	test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \
-	map_ptr_kern.c
+	map_ptr_kern.c core_kern.c
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
 SKEL_BLACKLIST += $$(LSKELS)
diff --git a/tools/testing/selftests/bpf/prog_tests/core_kern.c b/tools/testing/selftests/bpf/prog_tests/core_kern.c
new file mode 100644
index 000000000000..561c5185d886
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/core_kern.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "test_progs.h"
+#include "core_kern.lskel.h"
+
+void test_core_kern_lskel(void)
+{
+	struct core_kern_lskel *skel;
+
+	skel = core_kern_lskel__open_and_load();
+	ASSERT_OK_PTR(skel, "open_and_load");
+	core_kern_lskel__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/core_kern.c b/tools/testing/selftests/bpf/progs/core_kern.c
new file mode 100644
index 000000000000..13499cc15c7d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/core_kern.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define ATTR __always_inline
+#include "test_jhash.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 256);
+} array1 SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 256);
+} array2 SEC(".maps");
+
+static __noinline int randmap(int v, const struct net_device *dev)
+{
+	struct bpf_map *map = (struct bpf_map *)&array1;
+	int key = bpf_get_prandom_u32() & 0xff;
+	int *val;
+
+	if (bpf_get_prandom_u32() & 1)
+		map = (struct bpf_map *)&array2;
+
+	val = bpf_map_lookup_elem(map, &key);
+	if (val)
+		*val = bpf_get_prandom_u32() + v + dev->mtu;
+
+	return 0;
+}
+
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device
+	     *from_dev, const struct net_device *to_dev, int sent, int drops,
+	     int err)
+{
+	return randmap(from_dev->ifindex, from_dev);
+}
+
+SEC("fentry/eth_type_trans")
+int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb,
+	     struct net_device *dev, unsigned short protocol)
+{
+	return randmap(dev->ifindex + skb->len, dev);
+}
+
+SEC("fexit/eth_type_trans")
+int BPF_PROG(fexit_eth_type_trans, struct sk_buff *skb,
+	     struct net_device *dev, unsigned short protocol)
+{
+	return randmap(dev->ifindex + skb->len, dev);
+}
+
+volatile const int never;
+
+struct __sk_bUfF /* it will not exist in vmlinux */ {
+	int len;
+} __attribute__((preserve_access_index));
+
+struct bpf_testmod_test_read_ctx /* it exists in bpf_testmod */ {
+	size_t len;
+} __attribute__((preserve_access_index));
+
+SEC("tc")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	void *ptr;
+	int ret = 0, nh_off, i = 0;
+
+	nh_off = 14;
+
+	/* pragma unroll doesn't work on large loops */
+#define C do { \
+	ptr = data + i; \
+	if (ptr + nh_off > data_end) \
+		break; \
+	ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \
+	if (never) { \
+		/* below is a dead code with unresolvable CO-RE relo */ \
+		i += ((struct __sk_bUfF *)ctx)->len; \
+		/* this CO-RE relo may or may not resolve
+		 * depending on whether bpf_testmod is loaded.
+		 */ \
+		i += ((struct bpf_testmod_test_read_ctx *)ctx)->len; \
+	} \
+	} while (0);
+#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;
+	C30;C30;C30; /* 90 calls */
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 3268f0316af629474ec4fa8d9b4e6f618cb96794 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:39 -0800
Subject: selftests/bpf: Revert CO-RE removal in test_ksyms_weak.

The commit 087cba799ced ("selftests/bpf: Add weak/typeless ksym test for light skeleton")
added test_ksyms_weak to light skeleton testing, but remove CO-RE access.
Revert that part of commit, since light skeleton can use CO-RE in the kernel.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-17-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/progs/test_ksyms_weak.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c
index 8eadbd4caf7a..5f8379aadb29 100644
--- a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c
+++ b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c
@@ -38,7 +38,7 @@ int pass_handler(const void *ctx)
 	/* tests existing symbols. */
 	rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
 	if (rq)
-		out__existing_typed = 0;
+		out__existing_typed = rq->cpu;
 	out__existing_typeless = (__u64)&bpf_prog_active;
 
 	/* tests non-existent symbols. */
-- 
cgit v1.2.3


From 098dc5335a2083223c80d058ab4d23f6ce120b97 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Dec 2021 10:10:40 -0800
Subject: selftests/bpf: Add CO-RE relocations to verifier scale test.

Add 182 CO-RE relocations to verifier scale test.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201181040.23337-18-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/progs/test_verif_scale2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
index f024154c7be7..f90ffcafd1e8 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale2.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2019 Facebook
-#include <linux/bpf.h>
+#include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #define ATTR __always_inline
 #include "test_jhash.h"
 
-SEC("scale90_inline")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
-- 
cgit v1.2.3


From eee9a6df0eed6481d5448a55b218a45868b41b5b Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Wed, 1 Dec 2021 14:51:02 +0000
Subject: selftests/bpf: Build testing_helpers.o out of tree

Add $(OUTPUT) prefix to testing_helpers.o, so it can be built out of
tree when necessary. At the moment, in addition to being built in-tree
even when out-of-tree is required, testing_helpers.o is not built with
the right recipe when cross-building.

For consistency the other helpers, cgroup_helpers and trace_helpers, can
also be passed as objects instead of source. Use *_HELPERS variable to
keep the Makefile readable.

Fixes: f87c1930ac29 ("selftests/bpf: Merge test_stub.c into testing_helpers.c")
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201145101.823159-1-jean-philippe@linaro.org
---
 tools/testing/selftests/bpf/Makefile | 40 ++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8981369b071b..cd73dccaeb09 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -192,22 +192,26 @@ TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
 
 $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
 
-$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_sock: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_sock_addr: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_sockmap: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c testing_helpers.o
-$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_sock_fields: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_sysctl: cgroup_helpers.c testing_helpers.o
-$(OUTPUT)/test_tag: testing_helpers.o
-$(OUTPUT)/test_lirc_mode2_user: testing_helpers.o
-$(OUTPUT)/xdping: testing_helpers.o
-$(OUTPUT)/flow_dissector_load: testing_helpers.o
-$(OUTPUT)/test_maps: testing_helpers.o
-$(OUTPUT)/test_verifier: testing_helpers.o
+CGROUP_HELPERS	:= $(OUTPUT)/cgroup_helpers.o
+TESTING_HELPERS	:= $(OUTPUT)/testing_helpers.o
+TRACE_HELPERS	:= $(OUTPUT)/trace_helpers.o
+
+$(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS)
+$(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS)
+$(OUTPUT)/test_tag: $(TESTING_HELPERS)
+$(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS)
+$(OUTPUT)/xdping: $(TESTING_HELPERS)
+$(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS)
+$(OUTPUT)/test_maps: $(TESTING_HELPERS)
+$(OUTPUT)/test_verifier: $(TESTING_HELPERS)
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
@@ -536,8 +540,8 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
-		 $(OUTPUT)/testing_helpers.o \
-		 $(OUTPUT)/trace_helpers.o \
+		 $(TESTING_HELPERS) \
+		 $(TRACE_HELPERS) \
 		 $(OUTPUT)/bench_count.o \
 		 $(OUTPUT)/bench_rename.o \
 		 $(OUTPUT)/bench_trigger.o \
-- 
cgit v1.2.3


From 8b4ff5f8bb126fa8ee6918f4854748277609cf68 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 30 Nov 2021 16:50:30 -0800
Subject: selftests/bpf: Update test names for xchg and cmpxchg

The test_cmpxchg() and test_xchg() functions say "test_run add".
Therefore, make them say "test_run cmpxchg" and "test_run xchg",
respectively.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211201005030.GA3071525@paulmck-ThinkPad-P17-Gen-1
---
 tools/testing/selftests/bpf/prog_tests/atomics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c
index 0f9525293881..86b7d5d84eec 100644
--- a/tools/testing/selftests/bpf/prog_tests/atomics.c
+++ b/tools/testing/selftests/bpf/prog_tests/atomics.c
@@ -167,7 +167,7 @@ static void test_cmpxchg(struct atomics_lskel *skel)
 	prog_fd = skel->progs.cmpxchg.prog_fd;
 	err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
 				NULL, NULL, &retval, &duration);
-	if (CHECK(err || retval, "test_run add",
+	if (CHECK(err || retval, "test_run cmpxchg",
 		  "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
 		goto cleanup;
 
@@ -196,7 +196,7 @@ static void test_xchg(struct atomics_lskel *skel)
 	prog_fd = skel->progs.xchg.prog_fd;
 	err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
 				NULL, NULL, &retval, &duration);
-	if (CHECK(err || retval, "test_run add",
+	if (CHECK(err || retval, "test_run xchg",
 		  "err %d errno %d retval %d duration %d\n", err, errno, retval, duration))
 		goto cleanup;
 
-- 
cgit v1.2.3


From 3345193f6f3cc24791c245d4ba2c38502f1cf684 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 22 Nov 2021 20:17:42 +0530
Subject: tools/resolve_btfids: Skip unresolved symbol warning for empty BTF
 sets

resolve_btfids prints a warning when it finds an unresolved symbol,
(id == 0) in id_patch. This can be the case for BTF sets that are empty
(due to disabled config options), hence printing warnings for certain
builds, most recently seen in [0].

The reason behind this is because id->cnt aliases id->id in btf_id
struct, leading to empty set showing up as ID 0 when we get to id_patch,
which triggers the warning. Since sets are an exception here, accomodate
by reusing hole in btf_id for bool is_set member, setting it to true for
BTF set when setting id->cnt, and use that to skip extraneous warning.

  [0]: https://lore.kernel.org/all/1b99ae14-abb4-d18f-cc6a-d7e523b25542@gmail.com

Before:

; ./tools/bpf/resolve_btfids/resolve_btfids -v -b vmlinux net/ipv4/tcp_cubic.ko
adding symbol tcp_cubic_kfunc_ids
WARN: resolve_btfids: unresolved symbol tcp_cubic_kfunc_ids
patching addr     0: ID       0 [tcp_cubic_kfunc_ids]
sorting  addr     4: cnt      0 [tcp_cubic_kfunc_ids]
update ok for net/ipv4/tcp_cubic.ko

After:

; ./tools/bpf/resolve_btfids/resolve_btfids -v -b vmlinux net/ipv4/tcp_cubic.ko
adding symbol tcp_cubic_kfunc_ids
patching addr     0: ID       0 [tcp_cubic_kfunc_ids]
sorting  addr     4: cnt      0 [tcp_cubic_kfunc_ids]
update ok for net/ipv4/tcp_cubic.ko

Fixes: 0e32dfc80bae ("bpf: Enable TCP congestion control kfunc from modules")
Reported-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20211122144742.477787-4-memxor@gmail.com
---
 tools/bpf/resolve_btfids/main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index a59cb0ee609c..73409e27be01 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -83,6 +83,7 @@ struct btf_id {
 		int	 cnt;
 	};
 	int		 addr_cnt;
+	bool		 is_set;
 	Elf64_Addr	 addr[ADDR_CNT];
 };
 
@@ -451,8 +452,10 @@ static int symbols_collect(struct object *obj)
 			 * in symbol's size, together with 'cnt' field hence
 			 * that - 1.
 			 */
-			if (id)
+			if (id) {
 				id->cnt = sym.st_size / sizeof(int) - 1;
+				id->is_set = true;
+			}
 		} else {
 			pr_err("FAILED unsupported prefix %s\n", prefix);
 			return -1;
@@ -568,9 +571,8 @@ static int id_patch(struct object *obj, struct btf_id *id)
 	int *ptr = data->d_buf;
 	int i;
 
-	if (!id->id) {
+	if (!id->id && !id->is_set)
 		pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name);
-	}
 
 	for (i = 0; i < id->addr_cnt; i++) {
 		unsigned long addr = id->addr[i];
-- 
cgit v1.2.3


From 74d9807023573ba2d82ec3f505f6aa0c7076918c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:16 -0800
Subject: libbpf: Use __u32 fields in bpf_map_create_opts

Corresponding Linux UAPI struct uses __u32, not int, so keep it
consistent.

Fixes: 992c4225419a ("libbpf: Unify low-level map creation APIs w/ new bpf_map_create()")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-2-andrii@kernel.org
---
 tools/lib/bpf/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 70b6f44fc8b0..f79e5fbcf1c1 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -43,12 +43,12 @@ struct bpf_map_create_opts {
 	__u32 btf_value_type_id;
 	__u32 btf_vmlinux_value_type_id;
 
-	int inner_map_fd;
-	int map_flags;
+	__u32 inner_map_fd;
+	__u32 map_flags;
 	__u64 map_extra;
 
-	int numa_node;
-	int map_ifindex;
+	__u32 numa_node;
+	__u32 map_ifindex;
 };
 #define bpf_map_create_opts__last_field map_ifindex
 
-- 
cgit v1.2.3


From dbdd2c7f8cec2d09ae0e1bd707ae6050fa1c105f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:17 -0800
Subject: libbpf: Add API to get/set log_level at per-program level

Add bpf_program__set_log_level() and bpf_program__log_level() to fetch
and adjust log_level sent during BPF_PROG_LOAD command. This allows to
selectively request more or less verbose output in BPF verifier log.

Also bump libbpf version to 0.7 and make these APIs the first in v0.7.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-3-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c         | 14 ++++++++++++++
 tools/lib/bpf/libbpf.h         |  2 ++
 tools/lib/bpf/libbpf.map       |  6 ++++++
 tools/lib/bpf/libbpf_version.h |  2 +-
 4 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1341ce539662..de260c94e418 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8475,6 +8475,20 @@ int bpf_program__set_flags(struct bpf_program *prog, __u32 flags)
 	return 0;
 }
 
+__u32 bpf_program__log_level(const struct bpf_program *prog)
+{
+	return prog->log_level;
+}
+
+int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level)
+{
+	if (prog->obj->loaded)
+		return libbpf_err(-EBUSY);
+
+	prog->log_level = log_level;
+	return 0;
+}
+
 #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) {			    \
 	.sec = sec_pfx,							    \
 	.prog_type = BPF_PROG_TYPE_##ptype,				    \
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index d02139fec4ac..148fa85bab33 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -499,6 +499,8 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog,
 
 LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags);
+LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level);
 
 LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 623002b83b2b..715df3a27389 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -419,3 +419,9 @@ LIBBPF_0.6.0 {
 		perf_buffer__new_raw;
 		perf_buffer__new_raw_deprecated;
 } LIBBPF_0.5.0;
+
+LIBBPF_0.7.0 {
+	global:
+		bpf_program__log_level;
+		bpf_program__set_log_level;
+};
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index dd56d76f291c..0fefefc3500b 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
 #define __LIBBPF_VERSION_H
 
 #define LIBBPF_MAJOR_VERSION 0
-#define LIBBPF_MINOR_VERSION 6
+#define LIBBPF_MINOR_VERSION 7
 
 #endif /* __LIBBPF_VERSION_H */
-- 
cgit v1.2.3


From a15d408b839af421fba0a2ff6df193c13ef753d4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:18 -0800
Subject: bpftool: Migrate off of deprecated bpf_create_map_xattr() API

Switch to bpf_map_create() API instead.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-4-andrii@kernel.org
---
 tools/bpf/bpftool/map.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 25b258804f11..cc530a229812 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -1261,7 +1261,10 @@ static int do_pin(int argc, char **argv)
 
 static int do_create(int argc, char **argv)
 {
-	struct bpf_create_map_attr attr = { NULL, };
+	LIBBPF_OPTS(bpf_map_create_opts, attr);
+	enum bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC;
+	__u32 key_size = 0, value_size = 0, max_entries = 0;
+	const char *map_name = NULL;
 	const char *pinfile;
 	int err = -1, fd;
 
@@ -1276,30 +1279,30 @@ static int do_create(int argc, char **argv)
 		if (is_prefix(*argv, "type")) {
 			NEXT_ARG();
 
-			if (attr.map_type) {
+			if (map_type) {
 				p_err("map type already specified");
 				goto exit;
 			}
 
-			attr.map_type = map_type_from_str(*argv);
-			if ((int)attr.map_type < 0) {
+			map_type = map_type_from_str(*argv);
+			if ((int)map_type < 0) {
 				p_err("unrecognized map type: %s", *argv);
 				goto exit;
 			}
 			NEXT_ARG();
 		} else if (is_prefix(*argv, "name")) {
 			NEXT_ARG();
-			attr.name = GET_ARG();
+			map_name = GET_ARG();
 		} else if (is_prefix(*argv, "key")) {
-			if (parse_u32_arg(&argc, &argv, &attr.key_size,
+			if (parse_u32_arg(&argc, &argv, &key_size,
 					  "key size"))
 				goto exit;
 		} else if (is_prefix(*argv, "value")) {
-			if (parse_u32_arg(&argc, &argv, &attr.value_size,
+			if (parse_u32_arg(&argc, &argv, &value_size,
 					  "value size"))
 				goto exit;
 		} else if (is_prefix(*argv, "entries")) {
-			if (parse_u32_arg(&argc, &argv, &attr.max_entries,
+			if (parse_u32_arg(&argc, &argv, &max_entries,
 					  "max entries"))
 				goto exit;
 		} else if (is_prefix(*argv, "flags")) {
@@ -1340,14 +1343,14 @@ static int do_create(int argc, char **argv)
 		}
 	}
 
-	if (!attr.name) {
+	if (!map_name) {
 		p_err("map name not specified");
 		goto exit;
 	}
 
 	set_max_rlimit();
 
-	fd = bpf_create_map_xattr(&attr);
+	fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, &attr);
 	if (fd < 0) {
 		p_err("map create failed: %s", strerror(errno));
 		goto exit;
-- 
cgit v1.2.3


From 045b233a29a2ea3a168296f000cd5b1c08c4a2f7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:19 -0800
Subject: selftests/bpf: Remove recently reintroduced legacy btf__dedup() use

We've added one extra patch that added back the use of legacy
btf__dedup() variant. Clean that up.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-5-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
index 94ff9757557a..878a864dae3b 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
@@ -364,7 +364,7 @@ static void test_split_dup_struct_in_cu()
 			"\t'f2' type_id=1 bits_offset=32");
 
 	/* ..dedup them... */
-	err = btf__dedup(btf1, NULL, NULL);
+	err = btf__dedup(btf1, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
 		goto cleanup;
 
@@ -405,7 +405,7 @@ static void test_split_dup_struct_in_cu()
 			"\t'f1' type_id=4 bits_offset=0\n"
 			"\t'f2' type_id=4 bits_offset=32");
 
-	err = btf__dedup(btf2, NULL, NULL);
+	err = btf__dedup(btf2, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
 		goto cleanup;
 
-- 
cgit v1.2.3


From 00872de6e1b004377f6036f95db43e2145606eb2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:20 -0800
Subject: selftests/bpf: Mute xdpxceiver.c's deprecation warnings

xdpxceiver.c is using AF_XDP APIs that are deprecated starting from
libbpf 0.7. Until we migrate the test to libxdp or solve this issue in
some other way, mute deprecation warnings within xdpxceiver.c.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-6-andrii@kernel.org
---
 tools/testing/selftests/bpf/xdpxceiver.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 040164c7efc1..0a5d23da486d 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -100,6 +100,12 @@
 #include "xdpxceiver.h"
 #include "../kselftest.h"
 
+/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf.
+ * Until xdpxceiver is either moved or re-writed into libxdp, suppress
+ * deprecation warnings in this file
+ */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
 static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
 static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
 static const char *IP1 = "192.168.100.162";
-- 
cgit v1.2.3


From 186d1a86003ddcf0ec9e85e17ece868663106639 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:21 -0800
Subject: selftests/bpf: Remove all the uses of deprecated
 bpf_prog_load_xattr()

Migrate all the selftests that were still using bpf_prog_load_xattr().
Few are converted to skeleton, others will use bpf_object__open_file()
API.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-7-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/bpf_verif_scale.c     | 30 +++++++----
 .../selftests/bpf/prog_tests/connect_force_port.c  | 17 ++++---
 tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 58 +++++++---------------
 .../selftests/bpf/prog_tests/sockopt_inherit.c     | 12 ++---
 .../selftests/bpf/prog_tests/sockopt_multi.c       | 12 ++---
 tools/testing/selftests/bpf/prog_tests/tcp_rtt.c   | 21 +++-----
 .../selftests/bpf/prog_tests/test_global_funcs.c   | 28 ++++++++---
 tools/testing/selftests/bpf/test_sock_addr.c       | 33 ++++++++----
 tools/testing/selftests/bpf/xdp_redirect_multi.c   | 15 +++---
 9 files changed, 119 insertions(+), 107 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 1fb16f8dad56..ff6cce9fef06 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -19,16 +19,28 @@ extern int extra_prog_load_log_flags;
 
 static int check_load(const char *file, enum bpf_prog_type type)
 {
-	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj = NULL;
-	int err, prog_fd;
-
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = file;
-	attr.prog_type = type;
-	attr.log_level = 4 | extra_prog_load_log_flags;
-	attr.prog_flags = BPF_F_TEST_RND_HI32;
-	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+	struct bpf_program *prog;
+	int err;
+
+	obj = bpf_object__open_file(file, NULL);
+	err = libbpf_get_error(obj);
+	if (err)
+		return err;
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	bpf_program__set_type(prog, type);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
+	bpf_program__set_log_level(prog, 4 | extra_prog_load_log_flags);
+
+	err = bpf_object__load(obj);
+
+err_out:
 	bpf_object__close(obj);
 	return err;
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
index 9229db2f5ca5..ca574e1e30e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
+++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
@@ -51,19 +51,20 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
 	bool v4 = family == AF_INET;
 	__u16 expected_local_port = v4 ? 22222 : 22223;
 	__u16 expected_peer_port = 60000;
-	struct bpf_prog_load_attr attr = {
-		.file = v4 ? "./connect_force_port4.o" :
-			     "./connect_force_port6.o",
-	};
 	struct bpf_program *prog;
 	struct bpf_object *obj;
-	int xlate_fd, fd, err;
+	const char *obj_file = v4 ? "connect_force_port4.o" : "connect_force_port6.o";
+	int fd, err;
 	__u32 duration = 0;
 
-	err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
-	if (err) {
-		log_err("Failed to load BPF object");
+	obj = bpf_object__open_file(obj_file, NULL);
+	if (!ASSERT_OK_PTR(obj, "bpf_obj_open"))
 		return -1;
+
+	err = bpf_object__load(obj);
+	if (!ASSERT_OK(err, "bpf_obj_load")) {
+		err = -EIO;
+		goto close_bpf_object;
 	}
 
 	prog = bpf_object__find_program_by_title(obj, v4 ?
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index 2a49f8fcde06..ce10d2fc3a6c 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
 #include <network_helpers.h>
+#include "kfree_skb.skel.h"
 
 struct meta {
 	int ifindex;
@@ -58,16 +59,11 @@ void serial_test_kfree_skb(void)
 		.ctx_in = &skb,
 		.ctx_size_in = sizeof(skb),
 	};
-	struct bpf_prog_load_attr attr = {
-		.file = "./kfree_skb.o",
-	};
-
-	struct bpf_link *link = NULL, *link_fentry = NULL, *link_fexit = NULL;
-	struct bpf_map *perf_buf_map, *global_data;
-	struct bpf_program *prog, *fentry, *fexit;
-	struct bpf_object *obj, *obj2 = NULL;
+	struct kfree_skb *skel = NULL;
+	struct bpf_link *link;
+	struct bpf_object *obj;
 	struct perf_buffer *pb = NULL;
-	int err, kfree_skb_fd;
+	int err;
 	bool passed = false;
 	__u32 duration = 0;
 	const int zero = 0;
@@ -78,40 +74,27 @@ void serial_test_kfree_skb(void)
 	if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
 		return;
 
-	err = bpf_prog_load_xattr(&attr, &obj2, &kfree_skb_fd);
-	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
-		goto close_prog;
-
-	prog = bpf_object__find_program_by_title(obj2, "tp_btf/kfree_skb");
-	if (CHECK(!prog, "find_prog", "prog kfree_skb not found\n"))
-		goto close_prog;
-	fentry = bpf_object__find_program_by_title(obj2, "fentry/eth_type_trans");
-	if (CHECK(!fentry, "find_prog", "prog eth_type_trans not found\n"))
-		goto close_prog;
-	fexit = bpf_object__find_program_by_title(obj2, "fexit/eth_type_trans");
-	if (CHECK(!fexit, "find_prog", "prog eth_type_trans not found\n"))
-		goto close_prog;
-
-	global_data = bpf_object__find_map_by_name(obj2, ".bss");
-	if (CHECK(!global_data, "find global data", "not found\n"))
+	skel = kfree_skb__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kfree_skb_skel"))
 		goto close_prog;
 
-	link = bpf_program__attach_raw_tracepoint(prog, NULL);
+	link = bpf_program__attach_raw_tracepoint(skel->progs.trace_kfree_skb, NULL);
 	if (!ASSERT_OK_PTR(link, "attach_raw_tp"))
 		goto close_prog;
-	link_fentry = bpf_program__attach_trace(fentry);
-	if (!ASSERT_OK_PTR(link_fentry, "attach fentry"))
-		goto close_prog;
-	link_fexit = bpf_program__attach_trace(fexit);
-	if (!ASSERT_OK_PTR(link_fexit, "attach fexit"))
+	skel->links.trace_kfree_skb = link;
+
+	link = bpf_program__attach_trace(skel->progs.fentry_eth_type_trans);
+	if (!ASSERT_OK_PTR(link, "attach fentry"))
 		goto close_prog;
+	skel->links.fentry_eth_type_trans = link;
 
-	perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map");
-	if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
+	link = bpf_program__attach_trace(skel->progs.fexit_eth_type_trans);
+	if (!ASSERT_OK_PTR(link, "attach fexit"))
 		goto close_prog;
+	skel->links.fexit_eth_type_trans = link;
 
 	/* set up perf buffer */
-	pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1,
+	pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1,
 			      on_sample, NULL, &passed, NULL);
 	if (!ASSERT_OK_PTR(pb, "perf_buf__new"))
 		goto close_prog;
@@ -133,7 +116,7 @@ void serial_test_kfree_skb(void)
 	 */
 	ASSERT_TRUE(passed, "passed");
 
-	err = bpf_map_lookup_elem(bpf_map__fd(global_data), &zero, test_ok);
+	err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.bss), &zero, test_ok);
 	if (CHECK(err, "get_result",
 		  "failed to get output data: %d\n", err))
 		goto close_prog;
@@ -141,9 +124,6 @@ void serial_test_kfree_skb(void)
 	CHECK_FAIL(!test_ok[0] || !test_ok[1]);
 close_prog:
 	perf_buffer__free(pb);
-	bpf_link__destroy(link);
-	bpf_link__destroy(link_fentry);
-	bpf_link__destroy(link_fexit);
 	bpf_object__close(obj);
-	bpf_object__close(obj2);
+	kfree_skb__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 86f97681ad89..6a953f4adfdc 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -167,20 +167,20 @@ static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
 
 static void run_test(int cgroup_fd)
 {
-	struct bpf_prog_load_attr attr = {
-		.file = "./sockopt_inherit.o",
-	};
 	int server_fd = -1, client_fd;
 	struct bpf_object *obj;
 	void *server_err;
 	pthread_t tid;
-	int ignored;
 	int err;
 
-	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
-	if (CHECK_FAIL(err))
+	obj = bpf_object__open_file("sockopt_inherit.o", NULL);
+	if (!ASSERT_OK_PTR(obj, "obj_open"))
 		return;
 
+	err = bpf_object__load(obj);
+	if (!ASSERT_OK(err, "obj_load"))
+		goto close_bpf_object;
+
 	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
 	if (CHECK_FAIL(err))
 		goto close_bpf_object;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
index bc34f7773444..abce12ddcc37 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -297,14 +297,10 @@ detach:
 
 void test_sockopt_multi(void)
 {
-	struct bpf_prog_load_attr attr = {
-		.file = "./sockopt_multi.o",
-	};
 	int cg_parent = -1, cg_child = -1;
 	struct bpf_object *obj = NULL;
 	int sock_fd = -1;
 	int err = -1;
-	int ignored;
 
 	cg_parent = test__join_cgroup("/parent");
 	if (CHECK_FAIL(cg_parent < 0))
@@ -314,8 +310,12 @@ void test_sockopt_multi(void)
 	if (CHECK_FAIL(cg_child < 0))
 		goto out;
 
-	err = bpf_prog_load_xattr(&attr, &obj, &ignored);
-	if (CHECK_FAIL(err))
+	obj = bpf_object__open_file("sockopt_multi.o", NULL);
+	if (!ASSERT_OK_PTR(obj, "obj_load"))
+		goto out;
+
+	err = bpf_object__load(obj);
+	if (!ASSERT_OK(err, "obj_load"))
 		goto out;
 
 	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
index 265b4fe33ec3..96ff2c20af81 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -2,6 +2,7 @@
 #include <test_progs.h>
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
+#include "tcp_rtt.skel.h"
 
 struct tcp_rtt_storage {
 	__u32 invoked;
@@ -91,26 +92,18 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
 
 static int run_test(int cgroup_fd, int server_fd)
 {
-	struct bpf_prog_load_attr attr = {
-		.prog_type = BPF_PROG_TYPE_SOCK_OPS,
-		.file = "./tcp_rtt.o",
-		.expected_attach_type = BPF_CGROUP_SOCK_OPS,
-	};
-	struct bpf_object *obj;
-	struct bpf_map *map;
+	struct tcp_rtt *skel;
 	int client_fd;
 	int prog_fd;
 	int map_fd;
 	int err;
 
-	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
-	if (err) {
-		log_err("Failed to load BPF object");
+	skel = tcp_rtt__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_load"))
 		return -1;
-	}
 
-	map = bpf_object__next_map(obj, NULL);
-	map_fd = bpf_map__fd(map);
+	map_fd = bpf_map__fd(skel->maps.socket_storage_map);
+	prog_fd = bpf_program__fd(skel->progs._sockops);
 
 	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0);
 	if (err) {
@@ -149,7 +142,7 @@ close_client_fd:
 	close(client_fd);
 
 close_bpf_object:
-	bpf_object__close(obj);
+	tcp_rtt__destroy(skel);
 	return err;
 }
 
diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
index 7e13129f593a..509e21d5cb9d 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -30,17 +30,29 @@ extern int extra_prog_load_log_flags;
 
 static int check_load(const char *file)
 {
-	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj = NULL;
-	int err, prog_fd;
+	struct bpf_program *prog;
+	int err;
 
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = file;
-	attr.prog_type = BPF_PROG_TYPE_UNSPEC;
-	attr.log_level = extra_prog_load_log_flags;
-	attr.prog_flags = BPF_F_TEST_RND_HI32;
 	found = false;
-	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+
+	obj = bpf_object__open_file(file, NULL);
+	err = libbpf_get_error(obj);
+	if (err)
+		return err;
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
+	bpf_program__set_log_level(prog, extra_prog_load_log_flags);
+
+	err = bpf_object__load(obj);
+
+err_out:
 	bpf_object__close(obj);
 	return err;
 }
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 05c9e4944c01..f0c8d05ba6d1 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -663,23 +663,36 @@ static int load_insns(const struct sock_addr_test *test,
 
 static int load_path(const struct sock_addr_test *test, const char *path)
 {
-	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj;
-	int prog_fd;
+	struct bpf_program *prog;
+	int err;
 
-	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
-	attr.file = path;
-	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	attr.expected_attach_type = test->expected_attach_type;
-	attr.prog_flags = BPF_F_TEST_RND_HI32;
+	obj = bpf_object__open_file(path, NULL);
+	err = libbpf_get_error(obj);
+	if (err) {
+		log_err(">>> Opening BPF object (%s) error.\n", path);
+		return -1;
+	}
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog)
+		goto err_out;
 
-	if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) {
+	bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR);
+	bpf_program__set_expected_attach_type(prog, test->expected_attach_type);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
+
+	err = bpf_object__load(obj);
+	if (err) {
 		if (test->expected_result != LOAD_REJECT)
 			log_err(">>> Loading program (%s) error.\n", path);
-		return -1;
+		goto err_out;
 	}
 
-	return prog_fd;
+	return bpf_program__fd(prog);
+err_out:
+	bpf_object__close(obj);
+	return -1;
 }
 
 static int bind4_prog_load(const struct sock_addr_test *test)
diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c
index f5ffba341c17..51c8224b4ccc 100644
--- a/tools/testing/selftests/bpf/xdp_redirect_multi.c
+++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c
@@ -85,10 +85,7 @@ int main(int argc, char **argv)
 {
 	int prog_fd, group_all, mac_map;
 	struct bpf_program *ingress_prog, *egress_prog;
-	struct bpf_prog_load_attr prog_load_attr = {
-		.prog_type = BPF_PROG_TYPE_UNSPEC,
-	};
-	int i, ret, opt, egress_prog_fd = 0;
+	int i, err, ret, opt, egress_prog_fd = 0;
 	struct bpf_devmap_val devmap_val;
 	bool attach_egress_prog = false;
 	unsigned char mac_addr[6];
@@ -147,10 +144,14 @@ int main(int argc, char **argv)
 	printf("\n");
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-	prog_load_attr.file = filename;
-
-	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+	obj = bpf_object__open_file(filename, NULL);
+	err = libbpf_get_error(obj);
+	if (err)
+		goto err_out;
+	err = bpf_object__load(obj);
+	if (err)
 		goto err_out;
+	prog_fd = bpf_program__fd(bpf_object__next_program(obj, NULL));
 
 	if (attach_egress_prog)
 		group_all = bpf_object__find_map_fd_by_name(obj, "map_egress");
-- 
cgit v1.2.3


From c93faaaf2f67ba5396840316651cdc7640d9fa9e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Dec 2021 15:28:24 -0800
Subject: libbpf: Deprecate bpf_prog_load_xattr() API

bpf_prog_load_xattr() is high-level API that's named as a low-level
BPF_PROG_LOAD wrapper APIs, but it actually operates on struct
bpf_object. It's badly and confusingly misnamed as it will load all the
progs insige bpf_object, returning prog_fd of the very first BPF
program. It also has a bunch of ad-hoc things like log_level override,
map_ifindex auto-setting, etc. All this can be expressed more explicitly
and cleanly through existing libbpf APIs. This patch marks
bpf_prog_load_xattr() for deprecation in libbpf v0.8 ([0]).

  [0] Closes: https://github.com/libbpf/libbpf/issues/308

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211201232824.3166325-10-andrii@kernel.org
---
 tools/lib/bpf/libbpf.h        | 1 +
 tools/lib/bpf/libbpf_common.h | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 148fa85bab33..c0d62dd37c5d 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -682,6 +682,7 @@ struct bpf_prog_load_attr {
 	int prog_flags;
 };
 
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open() and bpf_object__load() instead")
 LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 				   struct bpf_object **pobj, int *prog_fd);
 LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead")
diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h
index b21cefc9c3b6..000e37798ff2 100644
--- a/tools/lib/bpf/libbpf_common.h
+++ b/tools/lib/bpf/libbpf_common.h
@@ -40,6 +40,11 @@
 #else
 #define __LIBBPF_MARK_DEPRECATED_0_7(X)
 #endif
+#if __LIBBPF_CURRENT_VERSION_GEQ(0, 8)
+#define __LIBBPF_MARK_DEPRECATED_0_8(X) X
+#else
+#define __LIBBPF_MARK_DEPRECATED_0_8(X)
+#endif
 
 /* This set of internal macros allows to do "function overloading" based on
  * number of arguments provided by used in backwards-compatible way during the
-- 
cgit v1.2.3


From f6071e5e3961eeb5300bd0901c9e128598730ae3 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Tue, 30 Nov 2021 16:47:20 -0800
Subject: selftests/fib_tests: Rework fib_rp_filter_test()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently rp_filter tests in fib_tests.sh:fib_rp_filter_test() are
failing.  ping sockets are bound to dummy1 using the "-I" option
(SO_BINDTODEVICE), but socket lookup is failing when receiving ping
replies, since the routing table thinks they belong to dummy0.

For example, suppose ping is using a SOCK_RAW socket for ICMP messages.
When receiving ping replies, in __raw_v4_lookup(), sk->sk_bound_dev_if
is 3 (dummy1), but dif (skb_rtable(skb)->rt_iif) says 2 (dummy0), so the
raw_sk_bound_dev_eq() check fails.  Similar things happen in
ping_lookup() for SOCK_DGRAM sockets.

These tests used to pass due to a bug [1] in iputils, where "ping -I"
actually did not bind ICMP message sockets to device.  The bug has been
fixed by iputils commit f455fee41c07 ("ping: also bind the ICMP socket
to the specific device") in 2016, which is why our rp_filter tests
started to fail.  See [2] .

Fixing the tests while keeping everything in one netns turns out to be
nontrivial.  Rework the tests and build the following topology:

 ┌─────────────────────────────┐    ┌─────────────────────────────┐
 │  network namespace 1 (ns1)  │    │  network namespace 2 (ns2)  │
 │                             │    │                             │
 │  ┌────┐     ┌─────┐         │    │  ┌─────┐            ┌────┐  │
 │  │ lo │<───>│veth1│<────────┼────┼─>│veth2│<──────────>│ lo │  │
 │  └────┘     ├─────┴──────┐  │    │  ├─────┴──────┐     └────┘  │
 │             │192.0.2.1/24│  │    │  │192.0.2.1/24│             │
 │             └────────────┘  │    │  └────────────┘             │
 └─────────────────────────────┘    └─────────────────────────────┘

Consider sending an ICMP_ECHO packet A in ns2.  Both source and
destination IP addresses are 192.0.2.1, and we use strict mode rp_filter
in both ns1 and ns2:

  1. A is routed to lo since its destination IP address is one of ns2's
     local addresses (veth2);
  2. A is redirected from lo's egress to veth2's egress using mirred;
  3. A arrives at veth1's ingress in ns1;
  4. A is redirected from veth1's ingress to lo's ingress, again, using
     mirred;
  5. In __fib_validate_source(), fib_info_nh_uses_dev() returns false,
     since A was received on lo, but reverse path lookup says veth1;
  6. However A is not dropped since we have relaxed this check for lo in
     commit 66f8209547cc ("fib: relax source validation check for loopback
     packets");

Making sure A is not dropped here in this corner case is the whole point
of having this test.

  7. As A reaches the ICMP layer, an ICMP_ECHOREPLY packet, B, is
     generated;
  8. Similarly, B is redirected from lo's egress to veth1's egress (in
     ns1), then redirected once again from veth2's ingress to lo's
     ingress (in ns2), using mirred.

Also test "ping 127.0.0.1" from ns2.  It does not trigger the relaxed
check in __fib_validate_source(), but just to make sure the topology
works with loopback addresses.

Tested with ping from iputils 20210722-41-gf9fb573:

$ ./fib_tests.sh -t rp_filter

IPv4 rp_filter tests
    TEST: rp_filter passes local packets		[ OK ]
    TEST: rp_filter passes loopback packets		[ OK ]

[1] https://github.com/iputils/iputils/issues/55
[2] https://github.com/iputils/iputils/commit/f455fee41c077d4b700a473b2f5b3487b8febc1d

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: adb701d6cfa4 ("selftests: add a test case for rp_filter")
Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20211201004720.6357-1-yepeilin.cs@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib_tests.sh | 59 ++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 5abe92d55b69..996af1ae3d3d 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -444,24 +444,63 @@ fib_rp_filter_test()
 	setup
 
 	set -e
+	ip netns add ns2
+	ip netns set ns2 auto
+
+	ip -netns ns2 link set dev lo up
+
+	$IP link add name veth1 type veth peer name veth2
+	$IP link set dev veth2 netns ns2
+	$IP address add 192.0.2.1/24 dev veth1
+	ip -netns ns2 address add 192.0.2.1/24 dev veth2
+	$IP link set dev veth1 up
+	ip -netns ns2 link set dev veth2 up
+
 	$IP link set dev lo address 52:54:00:6a:c7:5e
-	$IP link set dummy0 address 52:54:00:6a:c7:5e
-	$IP link add dummy1 type dummy
-	$IP link set dummy1 address 52:54:00:6a:c7:5e
-	$IP link set dev dummy1 up
+	$IP link set dev veth1 address 52:54:00:6a:c7:5e
+	ip -netns ns2 link set dev lo address 52:54:00:6a:c7:5e
+	ip -netns ns2 link set dev veth2 address 52:54:00:6a:c7:5e
+
+	# 1. (ns2) redirect lo's egress to veth2's egress
+	ip netns exec ns2 tc qdisc add dev lo parent root handle 1: fq_codel
+	ip netns exec ns2 tc filter add dev lo parent 1: protocol arp basic \
+		action mirred egress redirect dev veth2
+	ip netns exec ns2 tc filter add dev lo parent 1: protocol ip basic \
+		action mirred egress redirect dev veth2
+
+	# 2. (ns1) redirect veth1's ingress to lo's ingress
+	$NS_EXEC tc qdisc add dev veth1 ingress
+	$NS_EXEC tc filter add dev veth1 ingress protocol arp basic \
+		action mirred ingress redirect dev lo
+	$NS_EXEC tc filter add dev veth1 ingress protocol ip basic \
+		action mirred ingress redirect dev lo
+
+	# 3. (ns1) redirect lo's egress to veth1's egress
+	$NS_EXEC tc qdisc add dev lo parent root handle 1: fq_codel
+	$NS_EXEC tc filter add dev lo parent 1: protocol arp basic \
+		action mirred egress redirect dev veth1
+	$NS_EXEC tc filter add dev lo parent 1: protocol ip basic \
+		action mirred egress redirect dev veth1
+
+	# 4. (ns2) redirect veth2's ingress to lo's ingress
+	ip netns exec ns2 tc qdisc add dev veth2 ingress
+	ip netns exec ns2 tc filter add dev veth2 ingress protocol arp basic \
+		action mirred ingress redirect dev lo
+	ip netns exec ns2 tc filter add dev veth2 ingress protocol ip basic \
+		action mirred ingress redirect dev lo
+
 	$NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1
 	$NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1
 	$NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1
-
-	$NS_EXEC tc qd add dev dummy1 parent root handle 1: fq_codel
-	$NS_EXEC tc filter add dev dummy1 parent 1: protocol arp basic action mirred egress redirect dev lo
-	$NS_EXEC tc filter add dev dummy1 parent 1: protocol ip basic action mirred egress redirect dev lo
+	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=1
+	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.accept_local=1
+	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.route_localnet=1
 	set +e
 
-	run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 198.51.100.1"
+	run_cmd "ip netns exec ns2 ping -w1 -c1 192.0.2.1"
 	log_test $? 0 "rp_filter passes local packets"
 
-	run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 127.0.0.1"
+	run_cmd "ip netns exec ns2 ping -w1 -c1 127.0.0.1"
 	log_test $? 0 "rp_filter passes loopback packets"
 
 	cleanup
-- 
cgit v1.2.3


From 988f01683c7f2bf9f8fe2bae1cf4010fcd1baaf5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 2 Dec 2021 21:45:34 +0100
Subject: objtool: Fix pv_ops noinstr validation

Boris reported that in one of his randconfig builds, objtool got
infinitely stuck. Turns out there's trivial list corruption in the
pv_ops tracking when a function is both in a static table and in a code
assignment.

Avoid re-adding function to the pv_ops[] lists when they're already on
it.

Fixes: db2b0c5d7b6f ("objtool: Support pv_opsindirect calls for noinstr")
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Borislav Petkov <bp@alien8.de>
Link: https://lkml.kernel.org/r/20211202204534.GA16608@worktop.programming.kicks-ass.net
---
 tools/objtool/elf.c     | 1 +
 tools/objtool/objtool.c | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 81a4c543ff7e..4b384c907027 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -375,6 +375,7 @@ static int read_symbols(struct elf *elf)
 			return -1;
 		}
 		memset(sym, 0, sizeof(*sym));
+		INIT_LIST_HEAD(&sym->pv_target);
 		sym->alias = sym;
 
 		sym->idx = i;
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index c90c7084e45a..bdf699f6552b 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -153,6 +153,10 @@ void objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func)
 	    !strcmp(func->name, "_paravirt_ident_64"))
 		return;
 
+	/* already added this function */
+	if (!list_empty(&func->pv_target))
+		return;
+
 	list_add(&func->pv_target, &f->pv_ops[idx].targets);
 	f->pv_ops[idx].clean = false;
 }
-- 
cgit v1.2.3


From 96f3896780153214040a6747974bebc1355307c0 Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 3 Dec 2021 10:53:21 +0800
Subject: selftests/tc-testing: add exit code

Mark the summary result as FAIL to prevent from confusing the selftest
framework if some of them are failed.

Previously, the selftest framework always treats it as *ok* even though
some of them are failed actually. That's because the script tdc.sh always
return 0.

 # All test results:
 #
 # 1..97
 # ok 1 83be - Create FQ-PIE with invalid number of flows
 # ok 2 8b6e - Create RED with no flags
[...snip]
 # ok 6 5f15 - Create RED with flags ECN, harddrop
 # ok 7 53e8 - Create RED with flags ECN, nodrop
 # ok 8 d091 - Fail to create RED with only nodrop flag
 # ok 9 af8e - Create RED with flags ECN, nodrop, harddrop
 # not ok 10 ce7d - Add mq Qdisc to multi-queue device (4 queues)
 #       Could not match regex pattern. Verify command output:
 # qdisc mq 1: root
 # qdisc fq_codel 0: parent 1:4 limit 10240p flows 1024 quantum 1514 target 5ms interval 100ms memory_limit 32Mb ecn drop_batch 64
 # qdisc fq_codel 0: parent 1:3 limit 10240p flows 1024 quantum 1514 target 5ms interval 100ms memory_limit 32Mb ecn drop_batch 64
[...snip]
 # ok 96 6979 - Change quantum of a strict ETS band
 # ok 97 9a7d - Change ETS strict band without quantum
 #
 #
 #
 #
 ok 1 selftests: tc-testing: tdc.sh <<< summary result

CC: Philip Li <philip.li@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Acked-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/tdc.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py
index a3e43189d940..ee22e3447ec7 100755
--- a/tools/testing/selftests/tc-testing/tdc.py
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -716,6 +716,7 @@ def set_operation_mode(pm, parser, args, remaining):
         list_test_cases(alltests)
         exit(0)
 
+    exit_code = 0 # KSFT_PASS
     if len(alltests):
         req_plugins = pm.get_required_plugins(alltests)
         try:
@@ -724,6 +725,8 @@ def set_operation_mode(pm, parser, args, remaining):
             print('The following plugins were not found:')
             print('{}'.format(pde.missing_pg))
         catresults = test_runner(pm, args, alltests)
+        if catresults.count_failures() != 0:
+            exit_code = 1 # KSFT_FAIL
         if args.format == 'none':
             print('Test results output suppression requested\n')
         else:
@@ -748,6 +751,8 @@ def set_operation_mode(pm, parser, args, remaining):
                         gid=int(os.getenv('SUDO_GID')))
     else:
         print('No tests found\n')
+        exit_code = 4 # KSFT_SKIP
+    exit(exit_code)
 
 def main():
     """
@@ -767,8 +772,5 @@ def main():
 
     set_operation_mode(pm, parser, args, remaining)
 
-    exit(0)
-
-
 if __name__ == "__main__":
     main()
-- 
cgit v1.2.3


From a8c9505c53c5f1f0aba572a4c70e2d91ad08434e Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 3 Dec 2021 10:53:22 +0800
Subject: selftests/tc-testing: add missing config

qdiscs/fq_pie requires CONFIG_NET_SCH_FQ_PIE, otherwise tc will fail
to create a fq_pie qdisc.

It fixes following issue:
 # not ok 57 83be - Create FQ-PIE with invalid number of flows
 #       Command exited with 2, expected 0
 # Error: Specified qdisc not found.

Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index b71828df5a6d..b1cd7efa4512 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -60,6 +60,7 @@ CONFIG_NET_IFE_SKBTCINDEX=m
 CONFIG_NET_SCH_FIFO=y
 CONFIG_NET_SCH_ETS=m
 CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_FQ_PIE=m
 
 #
 ## Network testing
-- 
cgit v1.2.3


From db925bca33a9f0029a9891defd926a4856dd5c87 Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 3 Dec 2021 10:53:23 +0800
Subject: selftests/tc-testing: Fix cannot create
 /sys/bus/netdevsim/new_device: Directory nonexistent

Install netdevsim to provide /sys/bus/netdevsim/new_device interface.

It helps to fix:
 # ok 97 9a7d - Change ETS strict band without quantum # skipped - skipped - previous setup failed 11 ce7d
 #
 #
 # -----> prepare stage *** Could not execute: "echo "1 1 4" > /sys/bus/netdevsim/new_device"
 #
 # -----> prepare stage *** Error message: "/bin/sh: 1: cannot create /sys/bus/netdevsim/new_device: Directory nonexistent
 # "
 #
 # -----> prepare stage *** Aborting test run.
 #
 #
 # <_io.BufferedReader name=5> *** stdout ***
 #

Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/config | 1 +
 tools/testing/selftests/tc-testing/tdc.sh | 1 +
 2 files changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index b1cd7efa4512..a3239d5e40c7 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -61,6 +61,7 @@ CONFIG_NET_SCH_FIFO=y
 CONFIG_NET_SCH_ETS=m
 CONFIG_NET_SCH_RED=m
 CONFIG_NET_SCH_FQ_PIE=m
+CONFIG_NETDEVSIM=m
 
 #
 ## Network testing
diff --git a/tools/testing/selftests/tc-testing/tdc.sh b/tools/testing/selftests/tc-testing/tdc.sh
index 7fe38c76db44..afb0cd86fa3d 100755
--- a/tools/testing/selftests/tc-testing/tdc.sh
+++ b/tools/testing/selftests/tc-testing/tdc.sh
@@ -1,5 +1,6 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
+modprobe netdevsim
 ./tdc.py -c actions --nobuildebpf
 ./tdc.py -c qdisc
-- 
cgit v1.2.3


From 0f8a3b48f91b8dc1f3eff06b77a63a17183fccbd Mon Sep 17 00:00:00 2001
From: Li Zhijian <zhijianx.li@intel.com>
Date: Fri, 3 Dec 2021 10:32:13 +0800
Subject: selftests: net/fcnal-test.sh: add exit code

Previously, the selftest framework always treats it as *ok* even though
some of them are failed actually. That's because the script always
returns 0.

It supports PASS/FAIL/SKIP exit code now.

CC: Philip Li <philip.li@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 7f5b265fcb90..a1da013d847b 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -4077,3 +4077,11 @@ cleanup 2>/dev/null
 
 printf "\nTests passed: %3d\n" ${nsuccess}
 printf "Tests failed: %3d\n"   ${nfail}
+
+if [ $nfail -ne 0 ]; then
+	exit 1 # KSFT_FAIL
+elif [ $nsuccess -eq 0 ]; then
+	exit $ksft_skip
+fi
+
+exit 0 # KSFT_PASS
-- 
cgit v1.2.3


From 0bf40542c05ef62997738cd45eea553415adb045 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 2 Dec 2021 16:46:40 -0800
Subject: perf: Mute libbpf API deprecations temporarily

Libbpf development version was bumped to 0.7 in c93faaaf2f67
("libbpf: Deprecate bpf_prog_load_xattr() API"), activating a bunch of
previously scheduled deprecations. Most APIs are pretty straightforward
to replace with newer APIs, but perf has a complicated mixed setup with
libbpf used both as static and shared configurations, which makes it
non-trivial to migrate the APIs.

Further, bpf_program__set_prep() needs more involved refactoring, which
will require help from Arnaldo and/or Jiri.

So for now, mute deprecation warnings and work on migrating perf off of
deprecated APIs separately with the input from owners of the perf tool.

Fixes: c93faaaf2f67 ("libbpf: Deprecate bpf_prog_load_xattr() API")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211203004640.2455717-1-andrii@kernel.org
---
 tools/perf/tests/bpf.c       | 4 ++++
 tools/perf/util/bpf-loader.c | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index 2bf146e49ce8..c52bf10f746e 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -312,9 +312,13 @@ static int check_env(void)
 		return err;
 	}
 
+/* temporarily disable libbpf deprecation warnings */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 	err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns,
 			       sizeof(insns) / sizeof(insns[0]),
 			       license, kver_int, NULL, 0);
+#pragma GCC diagnostic pop
 	if (err < 0) {
 		pr_err("Missing basic BPF support, skip this test: %s\n",
 		       strerror(errno));
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index fbb3c4057c30..528aeb0ab79d 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -29,6 +29,9 @@
 
 #include <internal/xyarray.h>
 
+/* temporarily disable libbpf deprecation warnings */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
 static int libbpf_perf_print(enum libbpf_print_level level __attribute__((unused)),
 			      const char *fmt, va_list args)
 {
-- 
cgit v1.2.3


From 2fa7d94afc1afbb4d702760c058dc2d7ed30f226 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Tue, 30 Nov 2021 20:16:07 +0200
Subject: bpf: Fix the off-by-two error in range markings

The first commit cited below attempts to fix the off-by-one error that
appeared in some comparisons with an open range. Due to this error,
arithmetically equivalent pieces of code could get different verdicts
from the verifier, for example (pseudocode):

  // 1. Passes the verifier:
  if (data + 8 > data_end)
      return early
  read *(u64 *)data, i.e. [data; data+7]

  // 2. Rejected by the verifier (should still pass):
  if (data + 7 >= data_end)
      return early
  read *(u64 *)data, i.e. [data; data+7]

The attempted fix, however, shifts the range by one in a wrong
direction, so the bug not only remains, but also such piece of code
starts failing in the verifier:

  // 3. Rejected by the verifier, but the check is stricter than in #1.
  if (data + 8 >= data_end)
      return early
  read *(u64 *)data, i.e. [data; data+7]

The change performed by that fix converted an off-by-one bug into
off-by-two. The second commit cited below added the BPF selftests
written to ensure than code chunks like #3 are rejected, however,
they should be accepted.

This commit fixes the off-by-two error by adjusting new_range in the
right direction and fixes the tests by changing the range into the
one that should actually fail.

Fixes: fb2a311a31d3 ("bpf: fix off by one for range markings with L{T, E} patterns")
Fixes: b37242c773b2 ("bpf: add test cases to bpf selftests to cover all access tests")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211130181607.593149-1-maximmi@nvidia.com
---
 kernel/bpf/verifier.c                              |  2 +-
 .../bpf/verifier/xdp_direct_packet_access.c        | 32 +++++++++++-----------
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 50efda51515b..f3001937bbb9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8422,7 +8422,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 
 	new_range = dst_reg->off;
 	if (range_right_open)
-		new_range--;
+		new_range++;
 
 	/* Examples for register markings:
 	 *
diff --git a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
index bfb97383e6b5..de172a5b8754 100644
--- a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
+++ b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
@@ -112,10 +112,10 @@
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 		    offsetof(struct xdp_md, data_end)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
 	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -167,10 +167,10 @@
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 		    offsetof(struct xdp_md, data_end)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
 	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -274,9 +274,9 @@
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 		    offsetof(struct xdp_md, data_end)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -437,9 +437,9 @@
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 		    offsetof(struct xdp_md, data_end)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -544,10 +544,10 @@
 		    offsetof(struct xdp_md, data_meta)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
 	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -599,10 +599,10 @@
 		    offsetof(struct xdp_md, data_meta)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
 	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -706,9 +706,9 @@
 		    offsetof(struct xdp_md, data_meta)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -869,9 +869,9 @@
 		    offsetof(struct xdp_md, data_meta)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
-	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
 	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
-	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-- 
cgit v1.2.3


From 78c1f8d0634cc35da613d844eda7c849fc50f643 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 3 Dec 2021 10:28:36 -0800
Subject: libbpf: Reduce bpf_core_apply_relo_insn() stack usage.

Reduce bpf_core_apply_relo_insn() stack usage and bump
BPF_CORE_SPEC_MAX_LEN limit back to 64.

Fixes: 29db4bea1d10 ("bpf: Prepare relo_core.c for kernel duty.")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211203182836.16646-1-alexei.starovoitov@gmail.com
---
 kernel/bpf/btf.c          | 11 ++++++++-
 tools/lib/bpf/libbpf.c    |  4 ++-
 tools/lib/bpf/relo_core.c | 62 +++++++++++++++--------------------------------
 tools/lib/bpf/relo_core.h | 30 ++++++++++++++++++++++-
 4 files changed, 61 insertions(+), 46 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ed4258cb0832..2a902a946f70 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6742,8 +6742,16 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 {
 	bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
 	struct bpf_core_cand_list cands = {};
+	struct bpf_core_spec *specs;
 	int err;
 
+	/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+	 * into arrays of btf_ids of struct fields and array indices.
+	 */
+	specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+	if (!specs)
+		return -ENOMEM;
+
 	if (need_cands) {
 		struct bpf_cand_cache *cc;
 		int i;
@@ -6779,8 +6787,9 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 	}
 
 	err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
-				       relo, relo_idx, ctx->btf, &cands);
+				       relo, relo_idx, ctx->btf, &cands, specs);
 out:
+	kfree(specs);
 	if (need_cands) {
 		kfree(cands.cands);
 		mutex_unlock(&cand_cache_mutex);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index de260c94e418..6db0b5e8540e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5515,6 +5515,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
 			       const struct btf *local_btf,
 			       struct hashmap *cand_cache)
 {
+	struct bpf_core_spec specs_scratch[3] = {};
 	const void *type_key = u32_as_hash_key(relo->type_id);
 	struct bpf_core_cand_list *cands = NULL;
 	const char *prog_name = prog->name;
@@ -5569,7 +5570,8 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
 		}
 	}
 
-	return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, relo_idx, local_btf, cands);
+	return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo,
+					relo_idx, local_btf, cands, specs_scratch);
 }
 
 static int
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index d194fb9306ed..32464f0ab4b1 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -68,33 +68,6 @@ enum libbpf_print_level {
 #include "libbpf_internal.h"
 #endif
 
-#define BPF_CORE_SPEC_MAX_LEN 32
-
-/* represents BPF CO-RE field or array element accessor */
-struct bpf_core_accessor {
-	__u32 type_id;		/* struct/union type or array element type */
-	__u32 idx;		/* field index or array index */
-	const char *name;	/* field name or NULL for array accessor */
-};
-
-struct bpf_core_spec {
-	const struct btf *btf;
-	/* high-level spec: named fields and array indices only */
-	struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
-	/* original unresolved (no skip_mods_or_typedefs) root type ID */
-	__u32 root_type_id;
-	/* CO-RE relocation kind */
-	enum bpf_core_relo_kind relo_kind;
-	/* high-level spec length */
-	int len;
-	/* raw, low-level spec: 1-to-1 with accessor spec string */
-	int raw_spec[BPF_CORE_SPEC_MAX_LEN];
-	/* raw spec length */
-	int raw_len;
-	/* field bit offset represented by spec */
-	__u32 bit_offset;
-};
-
 static bool is_flex_arr(const struct btf *btf,
 			const struct bpf_core_accessor *acc,
 			const struct btf_array *arr)
@@ -1200,9 +1173,12 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 			     const struct bpf_core_relo *relo,
 			     int relo_idx,
 			     const struct btf *local_btf,
-			     struct bpf_core_cand_list *cands)
+			     struct bpf_core_cand_list *cands,
+			     struct bpf_core_spec *specs_scratch)
 {
-	struct bpf_core_spec local_spec, cand_spec, targ_spec = {};
+	struct bpf_core_spec *local_spec = &specs_scratch[0];
+	struct bpf_core_spec *cand_spec = &specs_scratch[1];
+	struct bpf_core_spec *targ_spec = &specs_scratch[2];
 	struct bpf_core_relo_res cand_res, targ_res;
 	const struct btf_type *local_type;
 	const char *local_name;
@@ -1221,7 +1197,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 		return -EINVAL;
 
 	err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str,
-				  relo->kind, &local_spec);
+				  relo->kind, local_spec);
 	if (err) {
 		pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n",
 			prog_name, relo_idx, local_id, btf_kind_str(local_type),
@@ -1232,15 +1208,15 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 
 	pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name,
 		 relo_idx, core_relo_kind_str(relo->kind), relo->kind);
-	bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec);
+	bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, local_spec);
 	libbpf_print(LIBBPF_DEBUG, "\n");
 
 	/* TYPE_ID_LOCAL relo is special and doesn't need candidate search */
 	if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) {
 		targ_res.validate = true;
 		targ_res.poison = false;
-		targ_res.orig_val = local_spec.root_type_id;
-		targ_res.new_val = local_spec.root_type_id;
+		targ_res.orig_val = local_spec->root_type_id;
+		targ_res.new_val = local_spec->root_type_id;
 		goto patch_insn;
 	}
 
@@ -1253,38 +1229,38 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 
 
 	for (i = 0, j = 0; i < cands->len; i++) {
-		err = bpf_core_spec_match(&local_spec, cands->cands[i].btf,
-					  cands->cands[i].id, &cand_spec);
+		err = bpf_core_spec_match(local_spec, cands->cands[i].btf,
+					  cands->cands[i].id, cand_spec);
 		if (err < 0) {
 			pr_warn("prog '%s': relo #%d: error matching candidate #%d ",
 				prog_name, relo_idx, i);
-			bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec);
+			bpf_core_dump_spec(prog_name, LIBBPF_WARN, cand_spec);
 			libbpf_print(LIBBPF_WARN, ": %d\n", err);
 			return err;
 		}
 
 		pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name,
 			 relo_idx, err == 0 ? "non-matching" : "matching", i);
-		bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec);
+		bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, cand_spec);
 		libbpf_print(LIBBPF_DEBUG, "\n");
 
 		if (err == 0)
 			continue;
 
-		err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res);
+		err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res);
 		if (err)
 			return err;
 
 		if (j == 0) {
 			targ_res = cand_res;
-			targ_spec = cand_spec;
-		} else if (cand_spec.bit_offset != targ_spec.bit_offset) {
+			*targ_spec = *cand_spec;
+		} else if (cand_spec->bit_offset != targ_spec->bit_offset) {
 			/* if there are many field relo candidates, they
 			 * should all resolve to the same bit offset
 			 */
 			pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n",
-				prog_name, relo_idx, cand_spec.bit_offset,
-				targ_spec.bit_offset);
+				prog_name, relo_idx, cand_spec->bit_offset,
+				targ_spec->bit_offset);
 			return -EINVAL;
 		} else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) {
 			/* all candidates should result in the same relocation
@@ -1328,7 +1304,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 			 prog_name, relo_idx);
 
 		/* calculate single target relo result explicitly */
-		err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res);
+		err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, &targ_res);
 		if (err)
 			return err;
 	}
diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h
index 4f864b8e33b7..17799819ad7c 100644
--- a/tools/lib/bpf/relo_core.h
+++ b/tools/lib/bpf/relo_core.h
@@ -17,11 +17,39 @@ struct bpf_core_cand_list {
 	int len;
 };
 
+#define BPF_CORE_SPEC_MAX_LEN 64
+
+/* represents BPF CO-RE field or array element accessor */
+struct bpf_core_accessor {
+	__u32 type_id;		/* struct/union type or array element type */
+	__u32 idx;		/* field index or array index */
+	const char *name;	/* field name or NULL for array accessor */
+};
+
+struct bpf_core_spec {
+	const struct btf *btf;
+	/* high-level spec: named fields and array indices only */
+	struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN];
+	/* original unresolved (no skip_mods_or_typedefs) root type ID */
+	__u32 root_type_id;
+	/* CO-RE relocation kind */
+	enum bpf_core_relo_kind relo_kind;
+	/* high-level spec length */
+	int len;
+	/* raw, low-level spec: 1-to-1 with accessor spec string */
+	int raw_spec[BPF_CORE_SPEC_MAX_LEN];
+	/* raw spec length */
+	int raw_len;
+	/* field bit offset represented by spec */
+	__u32 bit_offset;
+};
+
 int bpf_core_apply_relo_insn(const char *prog_name,
 			     struct bpf_insn *insn, int insn_idx,
 			     const struct bpf_core_relo *relo, int relo_idx,
 			     const struct btf *local_btf,
-			     struct bpf_core_cand_list *cands);
+			     struct bpf_core_cand_list *cands,
+			     struct bpf_core_spec *specs_scratch);
 int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
 			      const struct btf *targ_btf, __u32 targ_id);
 
-- 
cgit v1.2.3


From da54ab14953c38d98cb3e34c564c06c3739394b2 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Tue, 30 Nov 2021 20:18:11 +0200
Subject: bpf: Fix the test_task_vma selftest to support output shorter than 1
 kB

The test for bpf_iter_task_vma assumes that the output will be longer
than 1 kB, as the comment above the loop says. Due to this assumption,
the loop becomes infinite if the output turns to be shorter than 1 kB.
The return value of read_fd_into_buffer is 0 when the end of file was
reached, and len isn't being increased any more.

This commit adds a break on EOF to handle short output correctly. For
the reference, this is the contents that I get when running test_progs
under vmtest.sh, and it's shorter than 1 kB:

00400000-00401000 r--p 00000000 fe:00 25867     /root/bpf/test_progs
00401000-00674000 r-xp 00001000 fe:00 25867     /root/bpf/test_progs
00674000-0095f000 r--p 00274000 fe:00 25867     /root/bpf/test_progs
0095f000-00983000 r--p 0055e000 fe:00 25867     /root/bpf/test_progs
00983000-00a8a000 rw-p 00582000 fe:00 25867     /root/bpf/test_progs
00a8a000-0484e000 rw-p 00000000 00:00 0
7f6c64000000-7f6c64021000 rw-p 00000000 00:00 0
7f6c64021000-7f6c68000000 ---p 00000000 00:00 0
7f6c6ac8f000-7f6c6ac90000 r--s 00000000 00:0d 8032
anon_inode:bpf-map
7f6c6ac90000-7f6c6ac91000 ---p 00000000 00:00 0
7f6c6ac91000-7f6c6b491000 rw-p 00000000 00:00 0
7f6c6b491000-7f6c6b492000 r--s 00000000 00:0d 8032
anon_inode:bpf-map
7f6c6b492000-7f6c6b493000 rw-s 00000000 00:0d 8032
anon_inode:bpf-map
7ffc1e23d000-7ffc1e25e000 rw-p 00000000 00:00 0
7ffc1e3b8000-7ffc1e3bc000 r--p 00000000 00:00 0
7ffc1e3bc000-7ffc1e3bd000 r-xp 00000000 00:00 0
7fffffffe000-7ffffffff000 --xp 00000000 00:00 0

Fixes: e8168840e16c ("selftests/bpf: Add test for bpf_iter_task_vma")
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211130181811.594220-1-maximmi@nvidia.com
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 0b996be923b5..b84f859b1267 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -1206,13 +1206,14 @@ static void test_task_vma(void)
 		goto out;
 
 	/* Read CMP_BUFFER_SIZE (1kB) from bpf_iter. Read in small chunks
-	 * to trigger seq_file corner cases. The expected output is much
-	 * longer than 1kB, so the while loop will terminate.
+	 * to trigger seq_file corner cases.
 	 */
 	len = 0;
 	while (len < CMP_BUFFER_SIZE) {
 		err = read_fd_into_buffer(iter_fd, task_vma_output + len,
 					  min(read_size, CMP_BUFFER_SIZE - len));
+		if (!err)
+			break;
 		if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n"))
 			goto out;
 		len += err;
-- 
cgit v1.2.3


From b80892ca022e9eb484771a66eb68e12364695a2a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Oct 2021 17:10:17 +0200
Subject: memremap: remove support for external pgmap refcounts

No driver is left using the external pgmap refcount, so remove the
code to support it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20211028151017.50234-1-hch@lst.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/p2pdma.c              |  2 +-
 include/linux/memremap.h          | 18 ++----------
 mm/memremap.c                     | 59 ++++++++-------------------------------
 tools/testing/nvdimm/test/iomap.c | 43 ++++++++--------------------
 4 files changed, 28 insertions(+), 94 deletions(-)

(limited to 'tools')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 8d47cb7218d1..454d5f6f16ff 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -219,7 +219,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
 			range_len(&pgmap->range), dev_to_node(&pdev->dev),
-			pgmap->ref);
+			&pgmap->ref);
 	if (error)
 		goto pages_free;
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..a8bc588fe7aa 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -72,16 +72,6 @@ struct dev_pagemap_ops {
 	 */
 	void (*page_free)(struct page *page);
 
-	/*
-	 * Transition the refcount in struct dev_pagemap to the dead state.
-	 */
-	void (*kill)(struct dev_pagemap *pgmap);
-
-	/*
-	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
-	 */
-	void (*cleanup)(struct dev_pagemap *pgmap);
-
 	/*
 	 * Used for private (un-addressable) device memory only.  Must migrate
 	 * the page back to a CPU accessible page.
@@ -95,8 +85,7 @@ struct dev_pagemap_ops {
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @ref: reference count that pins the devm_memremap_pages() mapping
- * @internal_ref: internal reference if @ref is not provided by the caller
- * @done: completion for @internal_ref
+ * @done: completion for @ref
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify defailed behavior
  * @ops: method table
@@ -109,8 +98,7 @@ struct dev_pagemap_ops {
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
-	struct percpu_ref *ref;
-	struct percpu_ref internal_ref;
+	struct percpu_ref ref;
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
@@ -191,7 +179,7 @@ static inline unsigned long memremap_compat_align(void)
 static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
 {
 	if (pgmap)
-		percpu_ref_put(pgmap->ref);
+		percpu_ref_put(&pgmap->ref);
 }
 
 #endif /* _LINUX_MEMREMAP_H_ */
diff --git a/mm/memremap.c b/mm/memremap.c
index 5a66a71ab591..643965da13a6 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -112,30 +112,6 @@ static unsigned long pfn_next(unsigned long pfn)
 #define for_each_device_pfn(pfn, map, i) \
 	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
 
-static void dev_pagemap_kill(struct dev_pagemap *pgmap)
-{
-	if (pgmap->ops && pgmap->ops->kill)
-		pgmap->ops->kill(pgmap);
-	else
-		percpu_ref_kill(pgmap->ref);
-}
-
-static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
-	if (pgmap->ops && pgmap->ops->cleanup) {
-		pgmap->ops->cleanup(pgmap);
-	} else {
-		wait_for_completion(&pgmap->done);
-		percpu_ref_exit(pgmap->ref);
-	}
-	/*
-	 * Undo the pgmap ref assignment for the internal case as the
-	 * caller may re-enable the same pgmap.
-	 */
-	if (pgmap->ref == &pgmap->internal_ref)
-		pgmap->ref = NULL;
-}
-
 static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
 {
 	struct range *range = &pgmap->ranges[range_id];
@@ -167,11 +143,12 @@ void memunmap_pages(struct dev_pagemap *pgmap)
 	unsigned long pfn;
 	int i;
 
-	dev_pagemap_kill(pgmap);
+	percpu_ref_kill(&pgmap->ref);
 	for (i = 0; i < pgmap->nr_range; i++)
 		for_each_device_pfn(pfn, pgmap, i)
 			put_page(pfn_to_page(pfn));
-	dev_pagemap_cleanup(pgmap);
+	wait_for_completion(&pgmap->done);
+	percpu_ref_exit(&pgmap->ref);
 
 	for (i = 0; i < pgmap->nr_range; i++)
 		pageunmap_range(pgmap, i);
@@ -188,8 +165,7 @@ static void devm_memremap_pages_release(void *data)
 
 static void dev_pagemap_percpu_release(struct percpu_ref *ref)
 {
-	struct dev_pagemap *pgmap =
-		container_of(ref, struct dev_pagemap, internal_ref);
+	struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
 
 	complete(&pgmap->done);
 }
@@ -295,8 +271,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), pgmap);
-	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
-			- pfn_first(pgmap, range_id));
+	percpu_ref_get_many(&pgmap->ref,
+		pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id));
 	return 0;
 
 err_add_memory:
@@ -362,22 +338,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 		break;
 	}
 
-	if (!pgmap->ref) {
-		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
-			return ERR_PTR(-EINVAL);
-
-		init_completion(&pgmap->done);
-		error = percpu_ref_init(&pgmap->internal_ref,
-				dev_pagemap_percpu_release, 0, GFP_KERNEL);
-		if (error)
-			return ERR_PTR(error);
-		pgmap->ref = &pgmap->internal_ref;
-	} else {
-		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
-			WARN(1, "Missing reference count teardown definition\n");
-			return ERR_PTR(-EINVAL);
-		}
-	}
+	init_completion(&pgmap->done);
+	error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+				GFP_KERNEL);
+	if (error)
+		return ERR_PTR(error);
 
 	devmap_managed_enable_get(pgmap);
 
@@ -486,7 +451,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 	/* fall back to slow path lookup */
 	rcu_read_lock();
 	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
-	if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+	if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
 		pgmap = NULL;
 	rcu_read_unlock();
 
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ed563bdd88f3..b752ce47ead3 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -100,25 +100,17 @@ static void nfit_test_kill(void *_pgmap)
 {
 	struct dev_pagemap *pgmap = _pgmap;
 
-	WARN_ON(!pgmap || !pgmap->ref);
-
-	if (pgmap->ops && pgmap->ops->kill)
-		pgmap->ops->kill(pgmap);
-	else
-		percpu_ref_kill(pgmap->ref);
-
-	if (pgmap->ops && pgmap->ops->cleanup) {
-		pgmap->ops->cleanup(pgmap);
-	} else {
-		wait_for_completion(&pgmap->done);
-		percpu_ref_exit(pgmap->ref);
-	}
+	WARN_ON(!pgmap);
+
+	percpu_ref_kill(&pgmap->ref);
+
+	wait_for_completion(&pgmap->done);
+	percpu_ref_exit(&pgmap->ref);
 }
 
 static void dev_pagemap_percpu_release(struct percpu_ref *ref)
 {
-	struct dev_pagemap *pgmap =
-		container_of(ref, struct dev_pagemap, internal_ref);
+	struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
 
 	complete(&pgmap->done);
 }
@@ -132,22 +124,11 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (!nfit_res)
 		return devm_memremap_pages(dev, pgmap);
 
-	if (!pgmap->ref) {
-		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
-			return ERR_PTR(-EINVAL);
-
-		init_completion(&pgmap->done);
-		error = percpu_ref_init(&pgmap->internal_ref,
-				dev_pagemap_percpu_release, 0, GFP_KERNEL);
-		if (error)
-			return ERR_PTR(error);
-		pgmap->ref = &pgmap->internal_ref;
-	} else {
-		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
-			WARN(1, "Missing reference count teardown definition\n");
-			return ERR_PTR(-EINVAL);
-		}
-	}
+	init_completion(&pgmap->done);
+	error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+				GFP_KERNEL);
+	if (error)
+		return ERR_PTR(error);
 
 	error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
 	if (error)
-- 
cgit v1.2.3


From 942df4dc5ea159100466f198d8687a49c2359ca3 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 4 Dec 2021 11:46:23 -0800
Subject: bpftool: Add debug mode for gen_loader.

Make -d flag functional for gen_loader style program loading.

For example:
$ bpftool prog load -L -d test_d_path.o
... // will print:
libbpf: loading ./test_d_path.o
libbpf: elf: section(3) fentry/security_inode_getattr, size 280, link 0, flags 6, type=1
...
libbpf: prog 'prog_close': found data map 0 (test_d_p.bss, sec 7, off 0) for insn 30
libbpf: gen: load_btf: size 5376
libbpf: gen: map_create: test_d_p.bss idx 0 type 2 value_type_id 118
libbpf: map 'test_d_p.bss': created successfully, fd=0
libbpf: gen: map_update_elem: idx 0
libbpf: sec 'fentry/filp_close': found 1 CO-RE relocations
libbpf: record_relo_core: prog 1 insn[15] struct file 0:1 final insn_idx 15
libbpf: gen: prog_load: type 26 insns_cnt 35 progi_idx 0
libbpf: gen: find_attach_tgt security_inode_getattr 12
libbpf: gen: prog_load: type 26 insns_cnt 37 progi_idx 1
libbpf: gen: find_attach_tgt filp_close 12
libbpf: gen: finish 0
... // at this point libbpf finished generating loader program
   0: (bf) r6 = r1
   1: (bf) r1 = r10
   2: (07) r1 += -136
   3: (b7) r2 = 136
   4: (b7) r3 = 0
   5: (85) call bpf_probe_read_kernel#113
   6: (05) goto pc+104
... // this is the assembly dump of the loader program
 390: (63) *(u32 *)(r6 +44) = r0
 391: (18) r1 = map[idx:0]+5584
 393: (61) r0 = *(u32 *)(r1 +0)
 394: (63) *(u32 *)(r6 +24) = r0
 395: (b7) r0 = 0
 396: (95) exit
err 0  // the loader program was loaded and executed successfully
(null)
func#0 @0
...  // CO-RE in the kernel logs:
CO-RE relocating STRUCT file: found target candidate [500]
prog '': relo #0: kind <byte_off> (0), spec is [8] STRUCT file.f_path (0:1 @ offset 16)
prog '': relo #0: matching candidate #0 [500] STRUCT file.f_path (0:1 @ offset 16)
prog '': relo #0: patched insn #15 (ALU/ALU64) imm 16 -> 16
vmlinux_cand_cache:[11]file(500),
module_cand_cache:
... // verifier logs when it was checking test_d_path.o program:
R1 type=ctx expected=fp
0: R1=ctx(id=0,off=0,imm=0) R10=fp0
; int BPF_PROG(prog_close, struct file *file, void *id)
0: (79) r6 = *(u64 *)(r1 +0)
func 'filp_close' arg0 has btf_id 500 type STRUCT 'file'
1: R1=ctx(id=0,off=0,imm=0) R6_w=ptr_file(id=0,off=0,imm=0) R10=fp0
; pid_t pid = bpf_get_current_pid_tgid() >> 32;
1: (85) call bpf_get_current_pid_tgid#14

... // if there are multiple programs being loaded by the loader program
... // only the last program in the elf file will be printed, since
... // the same verifier log_buf is used for all PROG_LOAD commands.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211204194623.27779-1-alexei.starovoitov@gmail.com
---
 tools/bpf/bpftool/prog.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e47e8b06cc3d..45ccc254e69f 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1774,17 +1774,19 @@ static int try_loader(struct gen_loader_opts *gen)
 					     sizeof(struct bpf_prog_desc));
 	int log_buf_sz = (1u << 24) - 1;
 	int err, fds_before, fd_delta;
-	char *log_buf;
+	char *log_buf = NULL;
 
 	ctx = alloca(ctx_sz);
 	memset(ctx, 0, ctx_sz);
 	ctx->sz = ctx_sz;
-	ctx->log_level = 1;
-	ctx->log_size = log_buf_sz;
-	log_buf = malloc(log_buf_sz);
-	if (!log_buf)
-		return -ENOMEM;
-	ctx->log_buf = (long) log_buf;
+	if (verifier_logs) {
+		ctx->log_level = 1 + 2 + 4;
+		ctx->log_size = log_buf_sz;
+		log_buf = malloc(log_buf_sz);
+		if (!log_buf)
+			return -ENOMEM;
+		ctx->log_buf = (long) log_buf;
+	}
 	opts.ctx = ctx;
 	opts.data = gen->data;
 	opts.data_sz = gen->data_sz;
@@ -1793,9 +1795,9 @@ static int try_loader(struct gen_loader_opts *gen)
 	fds_before = count_open_fds();
 	err = bpf_load_and_run(&opts);
 	fd_delta = count_open_fds() - fds_before;
-	if (err < 0) {
+	if (err < 0 || verifier_logs) {
 		fprintf(stderr, "err %d\n%s\n%s", err, opts.errstr, log_buf);
-		if (fd_delta)
+		if (fd_delta && err < 0)
 			fprintf(stderr, "loader prog leaked %d FDs\n",
 				fd_delta);
 	}
-- 
cgit v1.2.3


From cd7729043b31b32b34daaa6775e255b76de7f976 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 22 Nov 2021 18:30:40 +0900
Subject: libtraceevent: Add __rel_loc relative location attribute support

Add '__rel_loc' new dynamic data location attribute which encodes
the data location from the next to the field itself. This is similar
to the '__data_loc' but the location offset is not from the event
entry but from the next of the field.

This patch adds '__rel_loc' decoding support in the libtraceevent.

Link: https://lkml.kernel.org/r/163757343994.510314.13241077597729303802.stgit@devnote2

Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/lib/traceevent/event-parse.c  | 59 +++++++++++++++++++++++++------------
 tools/lib/traceevent/event-parse.h  |  5 ++--
 tools/lib/traceevent/parse-filter.c |  5 +++-
 3 files changed, 47 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index fe58843d047c..8e24c4c78c7f 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -1367,6 +1367,14 @@ static int field_is_dynamic(struct tep_format_field *field)
 	return 0;
 }
 
+static int field_is_relative_dynamic(struct tep_format_field *field)
+{
+	if (strncmp(field->type, "__rel_loc", 9) == 0)
+		return 1;
+
+	return 0;
+}
+
 static int field_is_long(struct tep_format_field *field)
 {
 	/* includes long long */
@@ -1622,6 +1630,8 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field **
 			field->flags |= TEP_FIELD_IS_STRING;
 		if (field_is_dynamic(field))
 			field->flags |= TEP_FIELD_IS_DYNAMIC;
+		if (field_is_relative_dynamic(field))
+			field->flags |= TEP_FIELD_IS_DYNAMIC | TEP_FIELD_IS_RELATIVE;
 		if (field_is_long(field))
 			field->flags |= TEP_FIELD_IS_LONG;
 
@@ -2928,7 +2938,7 @@ process_str(struct tep_event *event __maybe_unused, struct tep_print_arg *arg,
 
 	arg->type = TEP_PRINT_STRING;
 	arg->string.string = token;
-	arg->string.offset = -1;
+	arg->string.field = NULL;
 
 	if (read_expected(TEP_EVENT_DELIM, ")") < 0)
 		goto out_err;
@@ -2957,7 +2967,7 @@ process_bitmask(struct tep_event *event __maybe_unused, struct tep_print_arg *ar
 
 	arg->type = TEP_PRINT_BITMASK;
 	arg->bitmask.bitmask = token;
-	arg->bitmask.offset = -1;
+	arg->bitmask.field = NULL;
 
 	if (read_expected(TEP_EVENT_DELIM, ")") < 0)
 		goto out_err;
@@ -3123,19 +3133,23 @@ process_function(struct tep_event *event, struct tep_print_arg *arg,
 		free_token(token);
 		return process_int_array(event, arg, tok);
 	}
-	if (strcmp(token, "__get_str") == 0) {
+	if (strcmp(token, "__get_str") == 0 ||
+	    strcmp(token, "__get_rel_str") == 0) {
 		free_token(token);
 		return process_str(event, arg, tok);
 	}
-	if (strcmp(token, "__get_bitmask") == 0) {
+	if (strcmp(token, "__get_bitmask") == 0 ||
+	    strcmp(token, "__get_rel_bitmask") == 0) {
 		free_token(token);
 		return process_bitmask(event, arg, tok);
 	}
-	if (strcmp(token, "__get_dynamic_array") == 0) {
+	if (strcmp(token, "__get_dynamic_array") == 0 ||
+	    strcmp(token, "__get_rel_dynamic_array") == 0) {
 		free_token(token);
 		return process_dynamic_array(event, arg, tok);
 	}
-	if (strcmp(token, "__get_dynamic_array_len") == 0) {
+	if (strcmp(token, "__get_dynamic_array_len") == 0 ||
+	    strcmp(token, "__get_rel_dynamic_array_len") == 0) {
 		free_token(token);
 		return process_dynamic_array_len(event, arg, tok);
 	}
@@ -4163,14 +4177,16 @@ static void print_str_arg(struct trace_seq *s, void *data, int size,
 	case TEP_PRINT_STRING: {
 		int str_offset;
 
-		if (arg->string.offset == -1) {
-			struct tep_format_field *f;
+		if (!arg->string.field)
+			arg->string.field = tep_find_any_field(event, arg->string.string);
+		if (!arg->string.field)
+			break;
 
-			f = tep_find_any_field(event, arg->string.string);
-			arg->string.offset = f->offset;
-		}
-		str_offset = data2host4(tep, *(unsigned int *)(data + arg->string.offset));
+		str_offset = data2host4(tep,
+				*(unsigned int *)(data + arg->string.field->offset));
 		str_offset &= 0xffff;
+		if (arg->string.field->flags & TEP_FIELD_IS_RELATIVE)
+			str_offset += arg->string.field->offset + arg->string.field->size;
 		print_str_to_seq(s, format, len_arg, ((char *)data) + str_offset);
 		break;
 	}
@@ -4181,15 +4197,16 @@ static void print_str_arg(struct trace_seq *s, void *data, int size,
 		int bitmask_offset;
 		int bitmask_size;
 
-		if (arg->bitmask.offset == -1) {
-			struct tep_format_field *f;
-
-			f = tep_find_any_field(event, arg->bitmask.bitmask);
-			arg->bitmask.offset = f->offset;
-		}
-		bitmask_offset = data2host4(tep, *(unsigned int *)(data + arg->bitmask.offset));
+		if (!arg->bitmask.field)
+			arg->bitmask.field = tep_find_any_field(event, arg->bitmask.bitmask);
+		if (!arg->bitmask.field)
+			break;
+		bitmask_offset = data2host4(tep,
+				*(unsigned int *)(data + arg->bitmask.field->offset));
 		bitmask_size = bitmask_offset >> 16;
 		bitmask_offset &= 0xffff;
+		if (arg->bitmask.field->flags & TEP_FIELD_IS_RELATIVE)
+			bitmask_offset += arg->bitmask.field->offset + arg->bitmask.field->size;
 		print_bitmask_to_seq(tep, s, format, len_arg,
 				     data + bitmask_offset, bitmask_size);
 		break;
@@ -5109,6 +5126,8 @@ void tep_print_field(struct trace_seq *s, void *data,
 			offset = val;
 			len = offset >> 16;
 			offset &= 0xffff;
+			if (field->flags & TEP_FIELD_IS_RELATIVE)
+				offset += field->offset + field->size;
 		}
 		if (field->flags & TEP_FIELD_IS_STRING &&
 		    is_printable_array(data + offset, len)) {
@@ -6987,6 +7006,8 @@ void *tep_get_field_raw(struct trace_seq *s, struct tep_event *event,
 					 data + offset, field->size);
 		*len = offset >> 16;
 		offset &= 0xffff;
+		if (field->flags & TEP_FIELD_IS_RELATIVE)
+			offset += field->offset + field->size;
 	} else
 		*len = field->size;
 
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index a67ad9a5b835..41d4f9f6a843 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -125,6 +125,7 @@ enum tep_format_flags {
 	TEP_FIELD_IS_LONG	= 32,
 	TEP_FIELD_IS_FLAG	= 64,
 	TEP_FIELD_IS_SYMBOLIC	= 128,
+	TEP_FIELD_IS_RELATIVE	= 256,
 };
 
 struct tep_format_field {
@@ -153,12 +154,12 @@ struct tep_print_arg_atom {
 
 struct tep_print_arg_string {
 	char			*string;
-	int			offset;
+	struct tep_format_field *field;
 };
 
 struct tep_print_arg_bitmask {
 	char			*bitmask;
-	int			offset;
+	struct tep_format_field *field;
 };
 
 struct tep_print_arg_field {
diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
index 368826bb5a57..5df177070d53 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -1712,8 +1712,11 @@ static const char *get_field_str(struct tep_filter_arg *arg, struct tep_record *
 
 		if (arg->str.field->flags & TEP_FIELD_IS_DYNAMIC) {
 			addr = *(unsigned int *)val;
-			val = record->data + (addr & 0xffff);
 			size = addr >> 16;
+			addr &= 0xffff;
+			if (arg->str.field->flags & TEP_FIELD_IS_RELATIVE)
+				addr += arg->str.field->offset + arg->str.field->size;
+			val = record->data + addr;
 		}
 
 		/*
-- 
cgit v1.2.3


From 7c689c839734a23eda855e69a56ed4795533bf71 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 22 Nov 2021 18:30:48 +0900
Subject: tools/perf: Add '__rel_loc' event field parsing support

Add new '__rel_loc' dynamic data location attribute support.
This type attribute is similar to the '__data_loc' but records the
offset from the field itself.
The libtraceevent adds TEP_FIELD_IS_RELATIVE to the
'tep_format_field::flags' with TEP_FIELD_IS_DYNAMIC for'__rel_loc'.

Link: https://lkml.kernel.org/r/163757344810.510314.12449413842136229871.stgit@devnote2

Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/perf/builtin-trace.c                             | 2 ++
 tools/perf/util/data-convert-bt.c                      | 2 ++
 tools/perf/util/evsel.c                                | 2 ++
 tools/perf/util/python.c                               | 2 ++
 tools/perf/util/scripting-engines/trace-event-perl.c   | 2 ++
 tools/perf/util/scripting-engines/trace-event-python.c | 2 ++
 tools/perf/util/sort.c                                 | 2 ++
 7 files changed, 14 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0b52e08e558e..ea517762cb0c 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2726,6 +2726,8 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel,
 				offset = format_field__intval(field, sample, evsel->needs_swap);
 				syscall_arg.len = offset >> 16;
 				offset &= 0xffff;
+				if (field->flags & TEP_FIELD_IS_RELATIVE)
+					offset += field->offset + field->size;
 			}
 
 			val = (uintptr_t)(sample->raw_data + offset);
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 8f7705bbc2da..9e0aee276df8 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -318,6 +318,8 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 		offset = tmp_val;
 		len = offset >> 16;
 		offset &= 0xffff;
+		if (flags & TEP_FIELD_IS_RELATIVE)
+			offset += fmtf->offset + fmtf->size;
 	}
 
 	if (flags & TEP_FIELD_IS_ARRAY) {
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ac0127be0459..f29d37004f55 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2706,6 +2706,8 @@ void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char
 	if (field->flags & TEP_FIELD_IS_DYNAMIC) {
 		offset = *(int *)(sample->raw_data + field->offset);
 		offset &= 0xffff;
+		if (field->flags & TEP_FIELD_IS_RELATIVE)
+			offset += field->offset + field->size;
 	}
 
 	return sample->raw_data + offset;
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 563a9ba8954f..3e7538db9107 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -428,6 +428,8 @@ tracepoint_field(struct pyrf_event *pe, struct tep_format_field *field)
 			offset  = val;
 			len     = offset >> 16;
 			offset &= 0xffff;
+			if (field->flags & TEP_FIELD_IS_RELATIVE)
+				offset += field->offset + field->size;
 		}
 		if (field->flags & TEP_FIELD_IS_STRING &&
 		    is_printable_array(data + offset, len)) {
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 32a721b3e9a5..a5d945415bbc 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -392,6 +392,8 @@ static void perl_process_tracepoint(struct perf_sample *sample,
 			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
 				offset = *(int *)(data + field->offset);
 				offset &= 0xffff;
+				if (field->flags & TEP_FIELD_IS_RELATIVE)
+					offset += field->offset + field->size;
 			} else
 				offset = field->offset;
 			XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0)));
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c0c010350bc2..d1f1501ce7fc 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -942,6 +942,8 @@ static void python_process_tracepoint(struct perf_sample *sample,
 				offset  = val;
 				len     = offset >> 16;
 				offset &= 0xffff;
+				if (field->flags & TEP_FIELD_IS_RELATIVE)
+					offset += field->offset + field->size;
 			}
 			if (field->flags & TEP_FIELD_IS_STRING &&
 			    is_printable_array(data + offset, len)) {
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index a111065b484e..d9a106f0edb2 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2365,6 +2365,8 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
 		tep_read_number_field(field, a->raw_data, &dyn);
 		offset = dyn & 0xffff;
 		size = (dyn >> 16) & 0xffff;
+		if (field->flags & TEP_FIELD_IS_RELATIVE)
+			offset += field->offset + field->size;
 
 		/* record max width for output */
 		if (size > hde->dynamic_len)
-- 
cgit v1.2.3


From 222c98c7979084fbefb4ce2ae377210c6e42011e Mon Sep 17 00:00:00 2001
From: huangxuesen <huangxuesen@kuaishou.com>
Date: Mon, 6 Dec 2021 09:47:16 +0800
Subject: libbpf: Fix trivial typo

Fix typo in comment from 'bpf_skeleton_map' to 'bpf_map_skeleton'
and from 'bpf_skeleton_prog' to 'bpf_prog_skeleton'.

Signed-off-by: huangxuesen <huangxuesen@kuaishou.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1638755236-3851199-1-git-send-email-hxseverything@gmail.com
---
 tools/lib/bpf/libbpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index c0d62dd37c5d..2fa046a96142 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1038,11 +1038,11 @@ struct bpf_object_skeleton {
 	struct bpf_object **obj;
 
 	int map_cnt;
-	int map_skel_sz; /* sizeof(struct bpf_skeleton_map) */
+	int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */
 	struct bpf_map_skeleton *maps;
 
 	int prog_cnt;
-	int prog_skel_sz; /* sizeof(struct bpf_skeleton_prog) */
+	int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */
 	struct bpf_prog_skeleton *progs;
 };
 
-- 
cgit v1.2.3


From d5284dedccdb9053988278dd30c834d46b8c866d Mon Sep 17 00:00:00 2001
From: Grant Seltzer <grantseltzer@gmail.com>
Date: Mon, 6 Dec 2021 15:37:09 -0500
Subject: libbpf: Add doc comments in libbpf.h

This adds comments above functions in libbpf.h which document
their uses. These comments are of a format that doxygen and sphinx
can pick up and render. These are rendered by libbpf.readthedocs.org

These doc comments are for:

- bpf_object__open_file()
- bpf_object__open_mem()
- bpf_program__attach_uprobe()
- bpf_program__attach_uprobe_opts()

Signed-off-by: Grant Seltzer <grantseltzer@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211206203709.332530-1-grantseltzer@gmail.com
---
 tools/lib/bpf/libbpf.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 2fa046a96142..4802c1e736c3 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -112,8 +112,30 @@ struct bpf_object_open_opts {
 #define bpf_object_open_opts__last_field btf_custom_path
 
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
+
+/**
+ * @brief **bpf_object__open_file()** creates a bpf_object by opening
+ * the BPF ELF object file pointed to by the passed path and loading it
+ * into memory.
+ * @param path BPF object file path
+ * @param opts options for how to load the bpf object, this parameter is
+ * optional and can be set to NULL
+ * @return pointer to the new bpf_object; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_object *
 bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts);
+
+/**
+ * @brief **bpf_object__open_mem()** creates a bpf_object by reading
+ * the BPF objects raw bytes from a memory buffer containing a valid
+ * BPF ELF object file.
+ * @param obj_buf pointer to the buffer containing ELF file bytes
+ * @param obj_buf_sz number of bytes in the buffer
+ * @param opts options for how to load the bpf object
+ * @return pointer to the new bpf_object; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_object *
 bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
 		     const struct bpf_object_open_opts *opts);
@@ -348,10 +370,41 @@ struct bpf_uprobe_opts {
 };
 #define bpf_uprobe_opts__last_field retprobe
 
+/**
+ * @brief **bpf_program__attach_uprobe()** attaches a BPF program
+ * to the userspace function which is found by binary path and
+ * offset. You can optionally specify a particular proccess to attach
+ * to. You can also optionally attach the program to the function
+ * exit instead of entry.
+ *
+ * @param prog BPF program to attach
+ * @param retprobe Attach to function exit
+ * @param pid Process ID to attach the uprobe to, 0 for self (own process),
+ * -1 for all processes
+ * @param binary_path Path to binary that contains the function symbol
+ * @param func_offset Offset within the binary of the function symbol
+ * @return Reference to the newly created BPF link; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_link *
 bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe,
 			   pid_t pid, const char *binary_path,
 			   size_t func_offset);
+
+/**
+ * @brief **bpf_program__attach_uprobe_opts()** is just like
+ * bpf_program__attach_uprobe() except with a options struct
+ * for various configurations.
+ *
+ * @param prog BPF program to attach
+ * @param pid Process ID to attach the uprobe to, 0 for self (own process),
+ * -1 for all processes
+ * @param binary_path Path to binary that contains the function symbol
+ * @param func_offset Offset within the binary of the function symbol
+ * @param opts Options for altering program attachment
+ * @return Reference to the newly created BPF link; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_link *
 bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid,
 				const char *binary_path, size_t func_offset,
-- 
cgit v1.2.3


From 3f8d6577163f9903d4af5e5c0f90eb42944a8851 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 22 Nov 2021 10:11:05 -0300
Subject: Revert "perf bench: Fix two memory leaks detected with ASan"

This: This reverts commit 92723ea0f11d92496687db8c9725248e9d1e5e1d.

  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRRRR FAILED!
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRRRR FAILED!
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRRR FAILED!
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRRRRRRRRR Ok
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRR FAILED!
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRR Ok
  # perf test 91
  91: perf stat --bpf-counters test           :RRRRRRRRRRRRRRR Ok

yep, it seems the perf bench is broken so the counts won't correlated if
I revert this one:

  92723ea0f11d perf bench: Fix two memory leaks detected with ASan

it works for me again.. it seems to break -t option

   [root@dell-r440-01 perf]# ./perf bench sched messaging -g 1 -l 100 -t
   # Running 'sched/messaging' benchmark:
   RRRperf: CLIENT: ready write: Bad file descriptor
   Rperf: SENDER: write: Bad file descriptor

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sohaib Mohamed <sohaib.amhmd@gmail.com>
Cc: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/lkml/YZev7KClb%2Fud43Lc@krava/
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/bench/sched-messaging.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index fa0ff4ce2b74..488f6e6ba1a5 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -223,8 +223,6 @@ static unsigned int group(pthread_t *pth,
 		snd_ctx->out_fds[i] = fds[1];
 		if (!thread_mode)
 			close(fds[0]);
-
-		free(ctx);
 	}
 
 	/* Now we have all the fds, fork the senders */
@@ -241,8 +239,6 @@ static unsigned int group(pthread_t *pth,
 		for (i = 0; i < num_fds; i++)
 			close(snd_ctx->out_fds[i]);
 
-	free(snd_ctx);
-
 	/* Return number of children to reap */
 	return num_fds * 2;
 }
-- 
cgit v1.2.3


From 71a16df164b23210d4dcaf35c70825f47d7c5599 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Sep 2021 16:09:08 -0300
Subject: tools headers UAPI: Sync s390 syscall table file changed by new
 futex_waitv syscall

To pick the changes in these csets:

  6c122360cf2f4c5a ("s390: wire up sys_futex_waitv system call")

That add support for this new syscall in tools such as 'perf trace'.

For instance, this is now possible (adapted from the x86_64 test output):

  # perf trace -e futex_waitv
  ^C#
  # perf trace -v -e futex_waitv
  event qualifier tracepoint filter: (common_pid != 807333 && common_pid != 3564) && (id == 449)
  ^C#
  # perf trace -v -e futex* --max-events 10
  event qualifier tracepoint filter: (common_pid != 812168 && common_pid != 3564) && (id == 238 || id == 449)
           ? (         ): Timer/219310  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
       0.012 ( 0.002 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.024 ( 0.060 ms): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) = 0
       0.086 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.088 (         ): Timer/219310 futex(uaddr: 0x7fd0b152d424, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ...
       0.075 ( 0.005 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d420, op: WAKE|PRIVATE_FLAG, val: 1)     = 1
       0.169 ( 0.004 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d424, op: WAKE|PRIVATE_FLAG, val: 1)     = 1
       0.088 ( 0.089 ms): Timer/219310  ... [continued]: futex())                                            = 0
       0.179 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.181 (         ): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ...
  #

That is the filter expression attached to the raw_syscalls:sys_{enter,exit}
tracepoints.

  $ grep futex tools/perf/arch/s390/entry/syscalls/syscall.tbl
  238  common	futex			sys_futex			sys_futex_time32
  422	32	futex_time64		-				sys_futex
  449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
  $

This addresses this perf build warnings:

  Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl'
  diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>,
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: https://lore.kernel.org/lkml/YZ%2F2qRW%2FTScYTP1U@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index df5261e5cfe1..ed9c5c2eafad 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -451,3 +451,4 @@
 446  common	landlock_restrict_self	sys_landlock_restrict_self	sys_landlock_restrict_self
 # 447 reserved for memfd_secret
 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease
+449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
-- 
cgit v1.2.3


From c29d9792607e67ed8a3f6e9db0d96836d885a8c5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 25 Nov 2021 09:14:57 +0200
Subject: perf inject: Fix itrace space allowed for new attributes

The space allowed for new attributes can be too small if existing header
information is large. That can happen, for example, if there are very
many CPUs, due to having an event ID per CPU per event being stored in the
header information.

Fix by adding the existing header.data_offset. Also increase the extra
space allowed to 8KiB and align to a 4KiB boundary for neatness.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lore.kernel.org/lkml/20211125071457.2066863-1-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index bc5259db5fd9..b9d6306cc14e 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -820,7 +820,7 @@ static int __cmd_inject(struct perf_inject *inject)
 		inject->tool.ordered_events = true;
 		inject->tool.ordering_requires_timestamps = true;
 		/* Allow space in the header for new attributes */
-		output_data_offset = 4096;
+		output_data_offset = roundup(8192 + session->header.data_offset, 4096);
 		if (inject->strip)
 			strip_init(inject);
 	}
-- 
cgit v1.2.3


From cba43fcf7aaf8369f80aac26cc2c50232c065a9e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Sep 2021 16:09:08 -0300
Subject: tools headers UAPI: Sync powerpc syscall table file changed by new
 futex_waitv syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To pick the changes in this cset:

  a0eb2da92b715d0c ("futex: Wireup futex_waitv syscall")

That add support for this new syscall in tools such as 'perf trace'.

For instance, this is now possible (adapted from the x86_64 test output):

  # perf trace -e futex_waitv
  ^C#
  # perf trace -v -e futex_waitv
  event qualifier tracepoint filter: (common_pid != 807333 && common_pid != 3564) && (id == 449)
  ^C#
  # perf trace -v -e futex* --max-events 10
  event qualifier tracepoint filter: (common_pid != 812168 && common_pid != 3564) && (id == 221 || id == 449)
  mmap size 528384B
           ? (         ): Timer/219310  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
       0.012 ( 0.002 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.024 ( 0.060 ms): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) = 0
       0.086 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.088 (         ): Timer/219310 futex(uaddr: 0x7fd0b152d424, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ...
       0.075 ( 0.005 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d420, op: WAKE|PRIVATE_FLAG, val: 1)     = 1
       0.169 ( 0.004 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d424, op: WAKE|PRIVATE_FLAG, val: 1)     = 1
       0.088 ( 0.089 ms): Timer/219310  ... [continued]: futex())                                            = 0
       0.179 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
       0.181 (         ): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ...
  #

That is the filter expression attached to the raw_syscalls:sys_{enter,exit}
tracepoints.

  $ grep futex tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
  221	32	futex				sys_futex_time32
  221	64	futex				sys_futex
  221	spu	futex				sys_futex
  422	32	futex_time64			sys_futex			sys_futex
  449	common  futex_waitv                     sys_futex_waitv
  $

This addresses this perf build warnings:

  Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl'
  diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl

Reviewed-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>,
Cc: André Almeida <andrealmeid@collabora.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/YZ%2F1OU9mJuyS2HMa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 7bef917cc84e..15109af9d075 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -528,3 +528,4 @@
 446	common	landlock_restrict_self		sys_landlock_restrict_self
 # 447 reserved for memfd_secret
 448	common	process_mrelease		sys_process_mrelease
+449	common  futex_waitv                     sys_futex_waitv
-- 
cgit v1.2.3


From 4ffbe87e2d5b53bcb0213d8650bbe70bf942de6a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 23 Nov 2021 16:12:29 -0800
Subject: perf tools: Fix SMT detection fast read path

sysfs__read_int() returns 0 on success, and so the fast read path was
always failing.

Fixes: bb629484d924118e ("perf tools: Simplify checking if SMT is active.")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211124001231.3277836-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/smt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 20bacd5972ad..34f1b1b1176c 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -15,7 +15,7 @@ int smt_on(void)
 	if (cached)
 		return cached_result;
 
-	if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) > 0)
+	if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) >= 0)
 		goto done;
 
 	ncpu = sysconf(_SC_NPROCESSORS_CONF);
-- 
cgit v1.2.3


From 3d1d57debee2d342a47615707588b96658fabb85 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 30 Nov 2021 10:12:41 -0300
Subject: tools build: Remove needless libpython-version feature check that
 breaks test-all fast path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 66dfdff03d196e51 ("perf tools: Add Python 3 support") we don't use
the tools/build/feature/test-libpython-version.c version in any Makefile
feature check:

  $ find tools/ -type f | xargs grep feature-libpython-version
  $

The only place where this was used was removed in 66dfdff03d196e51:

  -        ifneq ($(feature-libpython-version), 1)
  -          $(warning Python 3 is not yet supported; please set)
  -          $(warning PYTHON and/or PYTHON_CONFIG appropriately.)
  -          $(warning If you also have Python 2 installed, then)
  -          $(warning try something like:)
  -          $(warning $(and ,))
  -          $(warning $(and ,)  make PYTHON=python2)
  -          $(warning $(and ,))
  -          $(warning Otherwise, disable Python support entirely:)
  -          $(warning $(and ,))
  -          $(warning $(and ,)  make NO_LIBPYTHON=1)
  -          $(warning $(and ,))
  -          $(error   $(and ,))
  -        else
  -          LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
  -          EXTLIBS += $(PYTHON_EMBED_LIBADD)
  -          LANG_BINDINGS += $(obj-perf)python/perf.so
  -          $(call detected,CONFIG_LIBPYTHON)
  -        endif

And nowadays we either build with PYTHON=python3 or just install the
python3 devel packages and perf will build against it.

But the leftover feature-libpython-version check made the fast path
feature detection to break in all cases except when python2 devel files
were installed:

  $ rpm -qa | grep python.*devel
  python3-devel-3.9.7-1.fc34.x86_64
  $ rm -rf /tmp/build/perf ; mkdir -p /tmp/build/perf ;
  $ make -C tools/perf O=/tmp/build/perf install-bin
  make: Entering directory '/var/home/acme/git/perf/tools/perf'
    BUILD:   Doing 'make -j32' parallel build
    HOSTCC  /tmp/build/perf/fixdep.o
  <SNIP>
  $ cat /tmp/build/perf/feature/test-all.make.output
  In file included from test-all.c:18:
  test-libpython-version.c:5:10: error: #error
      5 |         #error
        |          ^~~~~
  $ ldd ~/bin/perf | grep python
	libpython3.9.so.1.0 => /lib64/libpython3.9.so.1.0 (0x00007fda6dbcf000)
  $

As python3 is the norm these days, fix this by just removing the unused
feature-libpython-version feature check, making the test-all fast path
to work with the common case.

With this:

  $ rm -rf /tmp/build/perf ; mkdir -p /tmp/build/perf ;
  $ make -C tools/perf O=/tmp/build/perf install-bin |& head
  make: Entering directory '/var/home/acme/git/perf/tools/perf'
    BUILD:   Doing 'make -j32' parallel build
    HOSTCC  /tmp/build/perf/fixdep.o
    HOSTLD  /tmp/build/perf/fixdep-in.o
    LINK    /tmp/build/perf/fixdep

  Auto-detecting system features:
  ...                         dwarf: [ on  ]
  ...            dwarf_getlocations: [ on  ]
  ...                         glibc: [ on  ]
  $ ldd ~/bin/perf | grep python
	libpython3.9.so.1.0 => /lib64/libpython3.9.so.1.0 (0x00007f58800b0000)
  $ cat /tmp/build/perf/feature/test-all.make.output
  $

Reviewed-by: James Clark <james.clark@arm.com>
Fixes: 66dfdff03d196e51 ("perf tools: Add Python 3 support")
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jaroslav Škarvada <jskarvad@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/YaYmeeC6CS2b8OSz@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.feature                 |  1 -
 tools/build/feature/Makefile                 |  4 ----
 tools/build/feature/test-all.c               |  5 -----
 tools/build/feature/test-libpython-version.c | 11 -----------
 tools/perf/Makefile.config                   |  2 --
 5 files changed, 23 deletions(-)
 delete mode 100644 tools/build/feature/test-libpython-version.c

(limited to 'tools')

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 45a9a59828c3..ae61f464043a 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -48,7 +48,6 @@ FEATURE_TESTS_BASIC :=                  \
         numa_num_possible_cpus          \
         libperl                         \
         libpython                       \
-        libpython-version               \
         libslang                        \
         libslang-include-subdir         \
         libtraceevent                   \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 0a3244ad9673..1480910c792e 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -32,7 +32,6 @@ FILES=                                          \
          test-numa_num_possible_cpus.bin        \
          test-libperl.bin                       \
          test-libpython.bin                     \
-         test-libpython-version.bin             \
          test-libslang.bin                      \
          test-libslang-include-subdir.bin       \
          test-libtraceevent.bin                 \
@@ -227,9 +226,6 @@ $(OUTPUT)test-libperl.bin:
 $(OUTPUT)test-libpython.bin:
 	$(BUILD) $(FLAGS_PYTHON_EMBED)
 
-$(OUTPUT)test-libpython-version.bin:
-	$(BUILD)
-
 $(OUTPUT)test-libbfd.bin:
 	$(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
 
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index 0b243ce842be..5ffafb967b6e 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -14,10 +14,6 @@
 # include "test-libpython.c"
 #undef main
 
-#define main main_test_libpython_version
-# include "test-libpython-version.c"
-#undef main
-
 #define main main_test_libperl
 # include "test-libperl.c"
 #undef main
@@ -177,7 +173,6 @@
 int main(int argc, char *argv[])
 {
 	main_test_libpython();
-	main_test_libpython_version();
 	main_test_libperl();
 	main_test_hello();
 	main_test_libelf();
diff --git a/tools/build/feature/test-libpython-version.c b/tools/build/feature/test-libpython-version.c
deleted file mode 100644
index 47714b942d4d..000000000000
--- a/tools/build/feature/test-libpython-version.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <Python.h>
-
-#if PY_VERSION_HEX >= 0x03000000
-	#error
-#endif
-
-int main(void)
-{
-	return 0;
-}
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index afd144725a0b..3df74cf5651a 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -271,8 +271,6 @@ endif
 
 FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
 FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
-FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
-FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
 
 FEATURE_CHECK_LDFLAGS-libaio = -lrt
 
-- 
cgit v1.2.3


From 6c481031c9f71e2b0ff9c49d21116a38ca671746 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 29 Nov 2021 12:23:39 +0100
Subject: perf test: Fix 'Simple expression parser' test on arch without CPU
 die topology info

Some platforms do not have CPU die support, for example s390.

Commit
Cc: Ian Rogers <irogers@google.com>
Fixes: fdf1e29b6118c18f ("perf expr: Add metric literals for topology.")
fails on s390:

  # perf test -Fv 7
    ...
  # FAILED tests/expr.c:173 #num_dies >= #num_packages
    ---- end ----
    Simple expression parser: FAILED!
  #

Investigating this issue leads to these functions:

 build_cpu_topology()
   +--> has_die_topology(void)
        {
           struct utsname uts;

           if (uname(&uts) < 0)
                  return false;
           if (strncmp(uts.machine, "x86_64", 6))
                  return false;
           ....
        }

which always returns false on s390. The caller build_cpu_topology()
checks has_die_topology() return value. On false the the struct
cpu_topology::die_cpu_list is not contructed and has zero entries. This
leads to the failing comparison: #num_dies >= #num_packages.  s390 of
course has a positive number of packages.

Fix this and check if the function build_cpu_topology() did build up
a die_cpus_list. The number of entries in this list should be larger
than 0. If the number of list element is zero, the die_cpus_list has
not been created and the check in function test__expr():

    TEST_ASSERT_VAL("#num_dies >= #num_packages", \
		    num_dies >= num_packages)

always fails.

Output after:

  # perf test -Fv 7
   7: Simple expression parser                                        :
   --- start ---
   division by zero
   syntax error
   ---- end ----
   Simple expression parser: Ok
  #

Fixes: fdf1e29b6118c18f ("perf expr: Add metric literals for topology.")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: http://lore.kernel.org/lkml/20211129112339.3003036-1-tmricht@linux.ibm.com
[ Added comment in the added 'if (num_dies)' line about architectures not having die topology ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/expr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index c895de481fe1..d54c5371c6a6 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -169,7 +169,9 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 	TEST_ASSERT_VAL("#num_dies", expr__parse(&num_dies, ctx, "#num_dies") == 0);
 	TEST_ASSERT_VAL("#num_cores >= #num_dies", num_cores >= num_dies);
 	TEST_ASSERT_VAL("#num_packages", expr__parse(&num_packages, ctx, "#num_packages") == 0);
-	TEST_ASSERT_VAL("#num_dies >= #num_packages", num_dies >= num_packages);
+
+	if (num_dies) // Some platforms do not have CPU die support, for example s390
+		TEST_ASSERT_VAL("#num_dies >= #num_packages", num_dies >= num_packages);
 
 	/*
 	 * Source count returns the number of events aggregating in a leader
-- 
cgit v1.2.3


From 1aa79e57730980dfbabd44e7c0a12fbb56064cb6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sun, 28 Nov 2021 00:58:10 -0800
Subject: perf test: Reset shadow counts before loading

Otherwise load counting is an average. Without this change
duration_time in test_memory_bandwidth will alter its value if an
earlier test contains duration_time.

This patch fixes an issue that's introduced in the proposed patch:
https://lore.kernel.org/lkml/20211124015226.3317994-1-irogers@google.com/
in perf test "Parse and process metrics".

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211128085810.4027314-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/parse-metric.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 574b7e4efd3a..07b6f4ec024f 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -109,6 +109,7 @@ static void load_runtime_stat(struct runtime_stat *st, struct evlist *evlist,
 	struct evsel *evsel;
 	u64 count;
 
+	perf_stat__reset_shadow_stats();
 	evlist__for_each_entry(evlist, evsel) {
 		count = find_value(evsel->name, vals);
 		perf_stat__update_shadow_stats(evsel, count, 0, st);
-- 
cgit v1.2.3


From 4747395082abc67c700a75e4cf3b796e79c7cf3a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 18 Nov 2021 12:17:30 -0800
Subject: perf header: Fix memory leaks when processing feature headers

These leaks were found with leak sanitizer running "perf pipe recording
and injection test".

In pipe mode feat_fd may hold onto an events struct that needs freeing.

When string features are processed they may overwrite an already created
string, so free this before the overwrite.

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211118201730.2302927-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 79cce216727e..e3c1a532d059 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2321,6 +2321,7 @@ out:
 #define FEAT_PROCESS_STR_FUN(__feat, __feat_env) \
 static int process_##__feat(struct feat_fd *ff, void *data __maybe_unused) \
 {\
+	free(ff->ph->env.__feat_env);		     \
 	ff->ph->env.__feat_env = do_read_string(ff); \
 	return ff->ph->env.__feat_env ? 0 : -ENOMEM; \
 }
@@ -4124,6 +4125,7 @@ int perf_event__process_feature(struct perf_session *session,
 	struct perf_record_header_feature *fe = (struct perf_record_header_feature *)event;
 	int type = fe->header.type;
 	u64 feat = fe->feat_id;
+	int ret = 0;
 
 	if (type < 0 || type >= PERF_RECORD_HEADER_MAX) {
 		pr_warning("invalid record type %d in pipe-mode\n", type);
@@ -4141,11 +4143,13 @@ int perf_event__process_feature(struct perf_session *session,
 	ff.size = event->header.size - sizeof(*fe);
 	ff.ph = &session->header;
 
-	if (feat_ops[feat].process(&ff, NULL))
-		return -1;
+	if (feat_ops[feat].process(&ff, NULL)) {
+		ret = -1;
+		goto out;
+	}
 
 	if (!feat_ops[feat].print || !tool->show_feat_hdr)
-		return 0;
+		goto out;
 
 	if (!feat_ops[feat].full_only ||
 	    tool->show_feat_hdr >= SHOW_FEAT_HEADER_FULL_INFO) {
@@ -4154,8 +4158,9 @@ int perf_event__process_feature(struct perf_session *session,
 		fprintf(stdout, "# %s info available, use -I to display\n",
 			feat_ops[feat].name);
 	}
-
-	return 0;
+out:
+	free_event_desc(ff.events);
+	return ret;
 }
 
 size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp)
-- 
cgit v1.2.3


From f7c4e85bccea960203e5872553957511df86913e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 3 Dec 2021 19:32:34 +0000
Subject: perf bpf: Fix building perf with BUILD_BPF_SKEL=1 by default in more
 distros

Arnaldo reported that building all his containers with BUILD_BPF_SKEL=1
to then make this the default he found problems in some distros where
the system linux/bpf.h file was being used and lacked this:

   util/bpf_skel/bperf_leader.bpf.c:13:20: error: use of undeclared identifier 'BPF_F_PRESERVE_ELEMS'
           __uint(map_flags, BPF_F_PRESERVE_ELEMS);

So use instead the vmlinux.h file generated by bpftool from BTF info.

This fixed these as well, getting the build back working on debian:11,
debian:experimental and ubuntu:21.10:

  In file included from In file included from util/bpf_skel/bperf_leader.bpf.cutil/bpf_skel/bpf_prog_profiler.bpf.c::33:
  :
  In file included from In file included from /usr/include/linux/bpf.h/usr/include/linux/bpf.h::1111:
  :
  /usr/include/linux/types.h/usr/include/linux/types.h::55::1010:: In file included from  util/bpf_skel/bperf_follower.bpf.c:3fatal errorfatal error:
  : : In file included from /usr/include/linux/bpf.h:'asm/types.h' file not found11'asm/types.h' file not found:

  /usr/include/linux/types.h:5:10: fatal error: 'asm/types.h' file not found
  #include <asm/types.h>#include <asm/types.h>

           ^~~~~~~~~~~~~         ^~~~~~~~~~~~~

  #include <asm/types.h>
           ^~~~~~~~~~~~~
  1 error generated.

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Song Liu <song@kernel.org>
Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lore.kernel.org/lkml/CF175681-8101-43D1-ABDB-449E644BE986@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_skel/bperf_follower.bpf.c    | 3 +--
 tools/perf/util/bpf_skel/bperf_leader.bpf.c      | 3 +--
 tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
index b8fa3cb2da23..4a6acfde1493 100644
--- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 // Copyright (c) 2021 Facebook
-#include <linux/bpf.h>
-#include <linux/perf_event.h>
+#include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bperf.h"
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
index 4f70d1459e86..40d962b05863 100644
--- a/tools/perf/util/bpf_skel/bperf_leader.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 // Copyright (c) 2021 Facebook
-#include <linux/bpf.h>
-#include <linux/perf_event.h>
+#include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bperf.h"
diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
index ab12b4c4ece2..97037d3b3d9f 100644
--- a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
+++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 // Copyright (c) 2020 Facebook
-#include <linux/bpf.h>
+#include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-- 
cgit v1.2.3


From 5a897531e00243cebbcc4dbe4ab06cd559ccf53f Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 3 Dec 2021 15:14:41 -0800
Subject: perf bpf_skel: Do not use typedef to avoid error on old clang

When building bpf_skel with clang-10, typedef causes confusions like:

  libbpf: map 'prev_readings': unexpected def kind var.

Fix this by removing the typedef.

Fixes: 7fac83aaf2eecc9e ("perf stat: Introduce 'bperf' to share hardware PMCs with BPF")
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lore.kernel.org/lkml/BEF5C312-4331-4A60-AEC0-AD7617CB2BC4@fb.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_skel/bperf.h              | 14 --------------
 tools/perf/util/bpf_skel/bperf_follower.bpf.c | 16 +++++++++++++---
 tools/perf/util/bpf_skel/bperf_leader.bpf.c   | 16 +++++++++++++---
 3 files changed, 26 insertions(+), 20 deletions(-)
 delete mode 100644 tools/perf/util/bpf_skel/bperf.h

(limited to 'tools')

diff --git a/tools/perf/util/bpf_skel/bperf.h b/tools/perf/util/bpf_skel/bperf.h
deleted file mode 100644
index 186a5551ddb9..000000000000
--- a/tools/perf/util/bpf_skel/bperf.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-// Copyright (c) 2021 Facebook
-
-#ifndef __BPERF_STAT_H
-#define __BPERF_STAT_H
-
-typedef struct {
-	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-	__uint(key_size, sizeof(__u32));
-	__uint(value_size, sizeof(struct bpf_perf_event_value));
-	__uint(max_entries, 1);
-} reading_map;
-
-#endif /* __BPERF_STAT_H */
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
index 4a6acfde1493..f193998530d4 100644
--- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -3,11 +3,21 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include "bperf.h"
 #include "bperf_u.h"
 
-reading_map diff_readings SEC(".maps");
-reading_map accum_readings SEC(".maps");
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} diff_readings SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} accum_readings SEC(".maps");
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
index 40d962b05863..e2a2d4cd7779 100644
--- a/tools/perf/util/bpf_skel/bperf_leader.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
@@ -3,7 +3,6 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include "bperf.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
@@ -12,8 +11,19 @@ struct {
 	__uint(map_flags, BPF_F_PRESERVE_ELEMS);
 } events SEC(".maps");
 
-reading_map prev_readings SEC(".maps");
-reading_map diff_readings SEC(".maps");
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} prev_readings SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} diff_readings SEC(".maps");
 
 SEC("raw_tp/sched_switch")
 int BPF_PROG(on_switch)
-- 
cgit v1.2.3


From 5cbd886ce2a9f4627ff9cdab1bc97a401a24b309 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 Dec 2021 14:35:33 -0800
Subject: selftests: mptcp: add TCP_INQ support

Do checks on the returned inq counter.

Fail on:
1. Huge value (> 1 kbyte, test case files are 1 kb)
2. last hint larger than returned bytes when read was short
3. erronenous indication of EOF.

3) happens when a hint of X bytes reads X-1 on next call
   but next recvmsg returns more data (instead of EOF).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c  | 60 +++++++++++++++++++++-
 tools/testing/selftests/net/mptcp/mptcp_sockopt.sh |  4 +-
 2 files changed, 61 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index ada9b80774d4..98de28ac3ba8 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -73,12 +73,20 @@ static uint32_t cfg_mark;
 struct cfg_cmsg_types {
 	unsigned int cmsg_enabled:1;
 	unsigned int timestampns:1;
+	unsigned int tcp_inq:1;
 };
 
 struct cfg_sockopt_types {
 	unsigned int transparent:1;
 };
 
+struct tcp_inq_state {
+	unsigned int last;
+	bool expect_eof;
+};
+
+static struct tcp_inq_state tcp_inq;
+
 static struct cfg_cmsg_types cfg_cmsg_types;
 static struct cfg_sockopt_types cfg_sockopt_types;
 
@@ -389,7 +397,9 @@ static size_t do_write(const int fd, char *buf, const size_t len)
 static void process_cmsg(struct msghdr *msgh)
 {
 	struct __kernel_timespec ts;
+	bool inq_found = false;
 	bool ts_found = false;
+	unsigned int inq = 0;
 	struct cmsghdr *cmsg;
 
 	for (cmsg = CMSG_FIRSTHDR(msgh); cmsg ; cmsg = CMSG_NXTHDR(msgh, cmsg)) {
@@ -398,12 +408,27 @@ static void process_cmsg(struct msghdr *msgh)
 			ts_found = true;
 			continue;
 		}
+		if (cmsg->cmsg_level == IPPROTO_TCP && cmsg->cmsg_type == TCP_CM_INQ) {
+			memcpy(&inq, CMSG_DATA(cmsg), sizeof(inq));
+			inq_found = true;
+			continue;
+		}
+
 	}
 
 	if (cfg_cmsg_types.timestampns) {
 		if (!ts_found)
 			xerror("TIMESTAMPNS not present\n");
 	}
+
+	if (cfg_cmsg_types.tcp_inq) {
+		if (!inq_found)
+			xerror("TCP_INQ not present\n");
+
+		if (inq > 1024)
+			xerror("tcp_inq %u is larger than one kbyte\n", inq);
+		tcp_inq.last = inq;
+	}
 }
 
 static ssize_t do_recvmsg_cmsg(const int fd, char *buf, const size_t len)
@@ -420,10 +445,23 @@ static ssize_t do_recvmsg_cmsg(const int fd, char *buf, const size_t len)
 		.msg_controllen = sizeof(msg_buf),
 	};
 	int flags = 0;
+	unsigned int last_hint = tcp_inq.last;
 	int ret = recvmsg(fd, &msg, flags);
 
-	if (ret <= 0)
+	if (ret <= 0) {
+		if (ret == 0 && tcp_inq.expect_eof)
+			return ret;
+
+		if (ret == 0 && cfg_cmsg_types.tcp_inq)
+			if (last_hint != 1 && last_hint != 0)
+				xerror("EOF but last tcp_inq hint was %u\n", last_hint);
+
 		return ret;
+	}
+
+	if (tcp_inq.expect_eof)
+		xerror("expected EOF, last_hint %u, now %u\n",
+		       last_hint, tcp_inq.last);
 
 	if (msg.msg_controllen && !cfg_cmsg_types.cmsg_enabled)
 		xerror("got %lu bytes of cmsg data, expected 0\n",
@@ -435,6 +473,19 @@ static ssize_t do_recvmsg_cmsg(const int fd, char *buf, const size_t len)
 	if (msg.msg_controllen)
 		process_cmsg(&msg);
 
+	if (cfg_cmsg_types.tcp_inq) {
+		if ((size_t)ret < len && last_hint > (unsigned int)ret) {
+			if (ret + 1 != (int)last_hint) {
+				int next = read(fd, msg_buf, sizeof(msg_buf));
+
+				xerror("read %u of %u, last_hint was %u tcp_inq hint now %u next_read returned %d/%m\n",
+				       ret, (unsigned int)len, last_hint, tcp_inq.last, next);
+			} else {
+				tcp_inq.expect_eof = true;
+			}
+		}
+	}
+
 	return ret;
 }
 
@@ -944,6 +995,8 @@ static void apply_cmsg_types(int fd, const struct cfg_cmsg_types *cmsg)
 
 	if (cmsg->timestampns)
 		xsetsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, &on, sizeof(on));
+	if (cmsg->tcp_inq)
+		xsetsockopt(fd, IPPROTO_TCP, TCP_INQ, &on, sizeof(on));
 }
 
 static void parse_cmsg_types(const char *type)
@@ -965,6 +1018,11 @@ static void parse_cmsg_types(const char *type)
 		return;
 	}
 
+	if (strncmp(type, "TCPINQ", len) == 0) {
+		cfg_cmsg_types.tcp_inq = 1;
+		return;
+	}
+
 	fprintf(stderr, "Unrecognized cmsg option %s\n", type);
 	exit(1);
 }
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index 41de643788b8..c8c364369599 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -178,7 +178,7 @@ do_transfer()
 
 	timeout ${timeout_test} \
 		ip netns exec ${listener_ns} \
-			$mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c TIMESTAMPNS \
+			$mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c TIMESTAMPNS,TCPINQ \
 				${local_addr} < "$sin" > "$sout" &
 	spid=$!
 
@@ -186,7 +186,7 @@ do_transfer()
 
 	timeout ${timeout_test} \
 		ip netns exec ${connector_ns} \
-			$mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c TIMESTAMPNS \
+			$mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c TIMESTAMPNS,TCPINQ \
 				$connect_addr < "$cin" > "$cout" &
 
 	cpid=$!
-- 
cgit v1.2.3


From b51880568f20250cc62057f4b747c8212ac3e06c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 Dec 2021 14:35:35 -0800
Subject: selftests: mptcp: add inq test case

client & server use a unix socket connection to communicate
outside of the mptcp connection.

This allows the consumer to know in advance how many bytes have been
(or will be) sent by the peer.
This allows stricter checks on the bytecounts reported by TCP_INQ cmsg.

Suggested-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/.gitignore       |   1 +
 tools/testing/selftests/net/mptcp/Makefile         |   2 +-
 tools/testing/selftests/net/mptcp/mptcp_inq.c      | 603 +++++++++++++++++++++
 tools/testing/selftests/net/mptcp/mptcp_sockopt.sh |  40 ++
 4 files changed, 645 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/mptcp/mptcp_inq.c

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore
index 7569d892967a..49daae73c41e 100644
--- a/tools/testing/selftests/net/mptcp/.gitignore
+++ b/tools/testing/selftests/net/mptcp/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 mptcp_connect
+mptcp_inq
 mptcp_sockopt
 pm_nl_ctl
 *.pcap
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index bbf4e448bad9..0356c4501c99 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -8,7 +8,7 @@ CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g  -I$(top_srcdir)/usr/include
 TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
 	      simult_flows.sh mptcp_sockopt.sh
 
-TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt
+TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c
new file mode 100644
index 000000000000..b8debd4fb5ed
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c
@@ -0,0 +1,603 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <time.h>
+
+#include <sys/ioctl.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+
+#include <linux/tcp.h>
+#include <linux/sockios.h>
+
+#ifndef IPPROTO_MPTCP
+#define IPPROTO_MPTCP 262
+#endif
+#ifndef SOL_MPTCP
+#define SOL_MPTCP 284
+#endif
+
+static int pf = AF_INET;
+static int proto_tx = IPPROTO_MPTCP;
+static int proto_rx = IPPROTO_MPTCP;
+
+static void die_perror(const char *msg)
+{
+	perror(msg);
+	exit(1);
+}
+
+static void die_usage(int r)
+{
+	fprintf(stderr, "Usage: mptcp_inq [-6] [ -t tcp|mptcp ] [ -r tcp|mptcp]\n");
+	exit(r);
+}
+
+static void xerror(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	fputc('\n', stderr);
+	exit(1);
+}
+
+static const char *getxinfo_strerr(int err)
+{
+	if (err == EAI_SYSTEM)
+		return strerror(errno);
+
+	return gai_strerror(err);
+}
+
+static void xgetaddrinfo(const char *node, const char *service,
+			 const struct addrinfo *hints,
+			 struct addrinfo **res)
+{
+	int err = getaddrinfo(node, service, hints, res);
+
+	if (err) {
+		const char *errstr = getxinfo_strerr(err);
+
+		fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
+			node ? node : "", service ? service : "", errstr);
+		exit(1);
+	}
+}
+
+static int sock_listen_mptcp(const char * const listenaddr,
+			     const char * const port)
+{
+	int sock;
+	struct addrinfo hints = {
+		.ai_protocol = IPPROTO_TCP,
+		.ai_socktype = SOCK_STREAM,
+		.ai_flags = AI_PASSIVE | AI_NUMERICHOST
+	};
+
+	hints.ai_family = pf;
+
+	struct addrinfo *a, *addr;
+	int one = 1;
+
+	xgetaddrinfo(listenaddr, port, &hints, &addr);
+	hints.ai_family = pf;
+
+	for (a = addr; a; a = a->ai_next) {
+		sock = socket(a->ai_family, a->ai_socktype, proto_rx);
+		if (sock < 0)
+			continue;
+
+		if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
+				     sizeof(one)))
+			perror("setsockopt");
+
+		if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
+			break; /* success */
+
+		perror("bind");
+		close(sock);
+		sock = -1;
+	}
+
+	freeaddrinfo(addr);
+
+	if (sock < 0)
+		xerror("could not create listen socket");
+
+	if (listen(sock, 20))
+		die_perror("listen");
+
+	return sock;
+}
+
+static int sock_connect_mptcp(const char * const remoteaddr,
+			      const char * const port, int proto)
+{
+	struct addrinfo hints = {
+		.ai_protocol = IPPROTO_TCP,
+		.ai_socktype = SOCK_STREAM,
+	};
+	struct addrinfo *a, *addr;
+	int sock = -1;
+
+	hints.ai_family = pf;
+
+	xgetaddrinfo(remoteaddr, port, &hints, &addr);
+	for (a = addr; a; a = a->ai_next) {
+		sock = socket(a->ai_family, a->ai_socktype, proto);
+		if (sock < 0)
+			continue;
+
+		if (connect(sock, a->ai_addr, a->ai_addrlen) == 0)
+			break; /* success */
+
+		die_perror("connect");
+	}
+
+	if (sock < 0)
+		xerror("could not create connect socket");
+
+	freeaddrinfo(addr);
+	return sock;
+}
+
+static int protostr_to_num(const char *s)
+{
+	if (strcasecmp(s, "tcp") == 0)
+		return IPPROTO_TCP;
+	if (strcasecmp(s, "mptcp") == 0)
+		return IPPROTO_MPTCP;
+
+	die_usage(1);
+	return 0;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "h6t:r:")) != -1) {
+		switch (c) {
+		case 'h':
+			die_usage(0);
+			break;
+		case '6':
+			pf = AF_INET6;
+			break;
+		case 't':
+			proto_tx = protostr_to_num(optarg);
+			break;
+		case 'r':
+			proto_rx = protostr_to_num(optarg);
+			break;
+		default:
+			die_usage(1);
+			break;
+		}
+	}
+}
+
+/* wait up to timeout milliseconds */
+static void wait_for_ack(int fd, int timeout, size_t total)
+{
+	int i;
+
+	for (i = 0; i < timeout; i++) {
+		int nsd, ret, queued = -1;
+		struct timespec req;
+
+		ret = ioctl(fd, TIOCOUTQ, &queued);
+		if (ret < 0)
+			die_perror("TIOCOUTQ");
+
+		ret = ioctl(fd, SIOCOUTQNSD, &nsd);
+		if (ret < 0)
+			die_perror("SIOCOUTQNSD");
+
+		if ((size_t)queued > total)
+			xerror("TIOCOUTQ %u, but only %zu expected\n", queued, total);
+		assert(nsd <= queued);
+
+		if (queued == 0)
+			return;
+
+		/* wait for peer to ack rx of all data */
+		req.tv_sec = 0;
+		req.tv_nsec = 1 * 1000 * 1000ul; /* 1ms */
+		nanosleep(&req, NULL);
+	}
+
+	xerror("still tx data queued after %u ms\n", timeout);
+}
+
+static void connect_one_server(int fd, int unixfd)
+{
+	size_t len, i, total, sent;
+	char buf[4096], buf2[4096];
+	ssize_t ret;
+
+	len = rand() % (sizeof(buf) - 1);
+
+	if (len < 128)
+		len = 128;
+
+	for (i = 0; i < len ; i++) {
+		buf[i] = rand() % 26;
+		buf[i] += 'A';
+	}
+
+	buf[i] = '\n';
+
+	/* un-block server */
+	ret = read(unixfd, buf2, 4);
+	assert(ret == 4);
+
+	assert(strncmp(buf2, "xmit", 4) == 0);
+
+	ret = write(unixfd, &len, sizeof(len));
+	assert(ret == (ssize_t)sizeof(len));
+
+	ret = write(fd, buf, len);
+	if (ret < 0)
+		die_perror("write");
+
+	if (ret != (ssize_t)len)
+		xerror("short write");
+
+	ret = read(unixfd, buf2, 4);
+	assert(strncmp(buf2, "huge", 4) == 0);
+
+	total = rand() % (16 * 1024 * 1024);
+	total += (1 * 1024 * 1024);
+	sent = total;
+
+	ret = write(unixfd, &total, sizeof(total));
+	assert(ret == (ssize_t)sizeof(total));
+
+	wait_for_ack(fd, 5000, len);
+
+	while (total > 0) {
+		if (total > sizeof(buf))
+			len = sizeof(buf);
+		else
+			len = total;
+
+		ret = write(fd, buf, len);
+		if (ret < 0)
+			die_perror("write");
+		total -= ret;
+
+		/* we don't have to care about buf content, only
+		 * number of total bytes sent
+		 */
+	}
+
+	ret = read(unixfd, buf2, 4);
+	assert(ret == 4);
+	assert(strncmp(buf2, "shut", 4) == 0);
+
+	wait_for_ack(fd, 5000, sent);
+
+	ret = write(fd, buf, 1);
+	assert(ret == 1);
+	close(fd);
+	ret = write(unixfd, "closed", 6);
+	assert(ret == 6);
+
+	close(unixfd);
+}
+
+static void get_tcp_inq(struct msghdr *msgh, unsigned int *inqv)
+{
+	struct cmsghdr *cmsg;
+
+	for (cmsg = CMSG_FIRSTHDR(msgh); cmsg ; cmsg = CMSG_NXTHDR(msgh, cmsg)) {
+		if (cmsg->cmsg_level == IPPROTO_TCP && cmsg->cmsg_type == TCP_CM_INQ) {
+			memcpy(inqv, CMSG_DATA(cmsg), sizeof(*inqv));
+			return;
+		}
+	}
+
+	xerror("could not find TCP_CM_INQ cmsg type");
+}
+
+static void process_one_client(int fd, int unixfd)
+{
+	unsigned int tcp_inq;
+	size_t expect_len;
+	char msg_buf[4096];
+	char buf[4096];
+	char tmp[16];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = 1,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = msg_buf,
+		.msg_controllen = sizeof(msg_buf),
+	};
+	ssize_t ret, tot;
+
+	ret = write(unixfd, "xmit", 4);
+	assert(ret == 4);
+
+	ret = read(unixfd, &expect_len, sizeof(expect_len));
+	assert(ret == (ssize_t)sizeof(expect_len));
+
+	if (expect_len > sizeof(buf))
+		xerror("expect len %zu exceeds buffer size", expect_len);
+
+	for (;;) {
+		struct timespec req;
+		unsigned int queued;
+
+		ret = ioctl(fd, FIONREAD, &queued);
+		if (ret < 0)
+			die_perror("FIONREAD");
+		if (queued > expect_len)
+			xerror("FIONREAD returned %u, but only %zu expected\n",
+			       queued, expect_len);
+		if (queued == expect_len)
+			break;
+
+		req.tv_sec = 0;
+		req.tv_nsec = 1000 * 1000ul;
+		nanosleep(&req, NULL);
+	}
+
+	/* read one byte, expect cmsg to return expected - 1 */
+	ret = recvmsg(fd, &msg, 0);
+	if (ret < 0)
+		die_perror("recvmsg");
+
+	if (msg.msg_controllen == 0)
+		xerror("msg_controllen is 0");
+
+	get_tcp_inq(&msg, &tcp_inq);
+
+	assert((size_t)tcp_inq == (expect_len - 1));
+
+	iov.iov_len = sizeof(buf);
+	ret = recvmsg(fd, &msg, 0);
+	if (ret < 0)
+		die_perror("recvmsg");
+
+	/* should have gotten exact remainder of all pending data */
+	assert(ret == (ssize_t)tcp_inq);
+
+	/* should be 0, all drained */
+	get_tcp_inq(&msg, &tcp_inq);
+	assert(tcp_inq == 0);
+
+	/* request a large swath of data. */
+	ret = write(unixfd, "huge", 4);
+	assert(ret == 4);
+
+	ret = read(unixfd, &expect_len, sizeof(expect_len));
+	assert(ret == (ssize_t)sizeof(expect_len));
+
+	/* peer should send us a few mb of data */
+	if (expect_len <= sizeof(buf))
+		xerror("expect len %zu too small\n", expect_len);
+
+	tot = 0;
+	do {
+		iov.iov_len = sizeof(buf);
+		ret = recvmsg(fd, &msg, 0);
+		if (ret < 0)
+			die_perror("recvmsg");
+
+		tot += ret;
+
+		get_tcp_inq(&msg, &tcp_inq);
+
+		if (tcp_inq > expect_len - tot)
+			xerror("inq %d, remaining %d total_len %d\n",
+			       tcp_inq, expect_len - tot, (int)expect_len);
+
+		assert(tcp_inq <= expect_len - tot);
+	} while ((size_t)tot < expect_len);
+
+	ret = write(unixfd, "shut", 4);
+	assert(ret == 4);
+
+	/* wait for hangup. Should have received one more byte of data. */
+	ret = read(unixfd, tmp, sizeof(tmp));
+	assert(ret == 6);
+	assert(strncmp(tmp, "closed", 6) == 0);
+
+	sleep(1);
+
+	iov.iov_len = 1;
+	ret = recvmsg(fd, &msg, 0);
+	if (ret < 0)
+		die_perror("recvmsg");
+	assert(ret == 1);
+
+	get_tcp_inq(&msg, &tcp_inq);
+
+	/* tcp_inq should be 1 due to received fin. */
+	assert(tcp_inq == 1);
+
+	iov.iov_len = 1;
+	ret = recvmsg(fd, &msg, 0);
+	if (ret < 0)
+		die_perror("recvmsg");
+
+	/* expect EOF */
+	assert(ret == 0);
+	get_tcp_inq(&msg, &tcp_inq);
+	assert(tcp_inq == 1);
+
+	close(fd);
+}
+
+static int xaccept(int s)
+{
+	int fd = accept(s, NULL, 0);
+
+	if (fd < 0)
+		die_perror("accept");
+
+	return fd;
+}
+
+static int server(int unixfd)
+{
+	int fd = -1, r, on = 1;
+
+	switch (pf) {
+	case AF_INET:
+		fd = sock_listen_mptcp("127.0.0.1", "15432");
+		break;
+	case AF_INET6:
+		fd = sock_listen_mptcp("::1", "15432");
+		break;
+	default:
+		xerror("Unknown pf %d\n", pf);
+		break;
+	}
+
+	r = write(unixfd, "conn", 4);
+	assert(r == 4);
+
+	alarm(15);
+	r = xaccept(fd);
+
+	if (-1 == setsockopt(r, IPPROTO_TCP, TCP_INQ, &on, sizeof(on)))
+		die_perror("setsockopt");
+
+	process_one_client(r, unixfd);
+
+	return 0;
+}
+
+static int client(int unixfd)
+{
+	int fd = -1;
+
+	alarm(15);
+
+	switch (pf) {
+	case AF_INET:
+		fd = sock_connect_mptcp("127.0.0.1", "15432", proto_tx);
+		break;
+	case AF_INET6:
+		fd = sock_connect_mptcp("::1", "15432", proto_tx);
+		break;
+	default:
+		xerror("Unknown pf %d\n", pf);
+	}
+
+	connect_one_server(fd, unixfd);
+
+	return 0;
+}
+
+static void init_rng(void)
+{
+	int fd = open("/dev/urandom", O_RDONLY);
+	unsigned int foo;
+
+	if (fd > 0) {
+		int ret = read(fd, &foo, sizeof(foo));
+
+		if (ret < 0)
+			srand(fd + foo);
+		close(fd);
+	}
+
+	srand(foo);
+}
+
+static pid_t xfork(void)
+{
+	pid_t p = fork();
+
+	if (p < 0)
+		die_perror("fork");
+	else if (p == 0)
+		init_rng();
+
+	return p;
+}
+
+static int rcheck(int wstatus, const char *what)
+{
+	if (WIFEXITED(wstatus)) {
+		if (WEXITSTATUS(wstatus) == 0)
+			return 0;
+		fprintf(stderr, "%s exited, status=%d\n", what, WEXITSTATUS(wstatus));
+		return WEXITSTATUS(wstatus);
+	} else if (WIFSIGNALED(wstatus)) {
+		xerror("%s killed by signal %d\n", what, WTERMSIG(wstatus));
+	} else if (WIFSTOPPED(wstatus)) {
+		xerror("%s stopped by signal %d\n", what, WSTOPSIG(wstatus));
+	}
+
+	return 111;
+}
+
+int main(int argc, char *argv[])
+{
+	int e1, e2, wstatus;
+	pid_t s, c, ret;
+	int unixfds[2];
+
+	parse_opts(argc, argv);
+
+	e1 = socketpair(AF_UNIX, SOCK_DGRAM, 0, unixfds);
+	if (e1 < 0)
+		die_perror("pipe");
+
+	s = xfork();
+	if (s == 0)
+		return server(unixfds[1]);
+
+	close(unixfds[1]);
+
+	/* wait until server bound a socket */
+	e1 = read(unixfds[0], &e1, 4);
+	assert(e1 == 4);
+
+	c = xfork();
+	if (c == 0)
+		return client(unixfds[0]);
+
+	close(unixfds[0]);
+
+	ret = waitpid(s, &wstatus, 0);
+	if (ret == -1)
+		die_perror("waitpid");
+	e1 = rcheck(wstatus, "server");
+	ret = waitpid(c, &wstatus, 0);
+	if (ret == -1)
+		die_perror("waitpid");
+	e2 = rcheck(wstatus, "client");
+
+	return e1 ? e1 : e2;
+}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index c8c364369599..0879da915014 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -279,6 +279,45 @@ run_tests()
 	fi
 }
 
+do_tcpinq_test()
+{
+	ip netns exec "$ns1" ./mptcp_inq "$@"
+	lret=$?
+	if [ $lret -ne 0 ];then
+		ret=$lret
+		echo "FAIL: mptcp_inq $@" 1>&2
+		return $lret
+	fi
+
+	echo "PASS: TCP_INQ cmsg/ioctl $@"
+	return $lret
+}
+
+do_tcpinq_tests()
+{
+	local lret=0
+
+	ip netns exec "$ns1" iptables -F
+	ip netns exec "$ns1" ip6tables -F
+
+	for args in "-t tcp" "-r tcp"; do
+		do_tcpinq_test $args
+		lret=$?
+		if [ $lret -ne 0 ] ; then
+			return $lret
+		fi
+		do_tcpinq_test -6 $args
+		lret=$?
+		if [ $lret -ne 0 ] ; then
+			return $lret
+		fi
+	done
+
+	do_tcpinq_test -r tcp -t tcp
+
+	return $?
+}
+
 sin=$(mktemp)
 sout=$(mktemp)
 cin=$(mktemp)
@@ -300,4 +339,5 @@ if [ $ret -eq 0 ];then
 	echo "PASS: SOL_MPTCP getsockopt has expected information"
 fi
 
+do_tcpinq_tests
 exit $ret
-- 
cgit v1.2.3


From edb596e80cee8f9cab4c37907c8004456e5b3581 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 Dec 2021 14:35:38 -0800
Subject: selftests: mptcp: check IP_TOS in/out are the same

Check that getsockopt(IP_TOS) returns what setsockopt(IP_TOS) did set
right before.

Also check that socklen_t == 0 and -1 input values match those
of normal tcp sockets.

Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_sockopt.c | 63 +++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
index 417b11cafafe..ac9a4d9c1764 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
@@ -4,6 +4,7 @@
 
 #include <assert.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <limits.h>
 #include <string.h>
 #include <stdarg.h>
@@ -13,6 +14,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
+#include <time.h>
 #include <unistd.h>
 
 #include <sys/socket.h>
@@ -594,6 +596,44 @@ static int server(int pipefd)
 	return 0;
 }
 
+static void test_ip_tos_sockopt(int fd)
+{
+	uint8_t tos_in, tos_out;
+	socklen_t s;
+	int r;
+
+	tos_in = rand() & 0xfc;
+	r = setsockopt(fd, SOL_IP, IP_TOS, &tos_in, sizeof(tos_out));
+	if (r != 0)
+		die_perror("setsockopt IP_TOS");
+
+	tos_out = 0;
+	s = sizeof(tos_out);
+	r = getsockopt(fd, SOL_IP, IP_TOS, &tos_out, &s);
+	if (r != 0)
+		die_perror("getsockopt IP_TOS");
+
+	if (tos_in != tos_out)
+		xerror("tos %x != %x socklen_t %d\n", tos_in, tos_out, s);
+
+	if (s != 1)
+		xerror("tos should be 1 byte");
+
+	s = 0;
+	r = getsockopt(fd, SOL_IP, IP_TOS, &tos_out, &s);
+	if (r != 0)
+		die_perror("getsockopt IP_TOS 0");
+	if (s != 0)
+		xerror("expect socklen_t == 0");
+
+	s = -1;
+	r = getsockopt(fd, SOL_IP, IP_TOS, &tos_out, &s);
+	if (r != -1 && errno != EINVAL)
+		die_perror("getsockopt IP_TOS did not indicate -EINVAL");
+	if (s != -1)
+		xerror("expect socklen_t == -1");
+}
+
 static int client(int pipefd)
 {
 	int fd = -1;
@@ -611,6 +651,8 @@ static int client(int pipefd)
 		xerror("Unknown pf %d\n", pf);
 	}
 
+	test_ip_tos_sockopt(fd);
+
 	connect_one_server(fd, pipefd);
 
 	return 0;
@@ -642,6 +684,25 @@ static int rcheck(int wstatus, const char *what)
 	return 111;
 }
 
+static void init_rng(void)
+{
+	int fd = open("/dev/urandom", O_RDONLY);
+
+	if (fd >= 0) {
+		unsigned int foo;
+		ssize_t ret;
+
+		/* can't fail */
+		ret = read(fd, &foo, sizeof(foo));
+		assert(ret == sizeof(foo));
+
+		close(fd);
+		srand(foo);
+	} else {
+		srand(time(NULL));
+	}
+}
+
 int main(int argc, char *argv[])
 {
 	int e1, e2, wstatus;
@@ -650,6 +711,8 @@ int main(int argc, char *argv[])
 
 	parse_opts(argc, argv);
 
+	init_rng();
+
 	e1 = pipe(pipefds);
 	if (e1 < 0)
 		die_perror("pipe");
-- 
cgit v1.2.3


From d43b75fbc23f0ac1ef9c14a5a166d3ccb761a451 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 26 Nov 2021 15:36:12 +0100
Subject: vrf: don't run conntrack on vrf with !dflt qdisc

After the below patch, the conntrack attached to skb is set to "notrack" in
the context of vrf device, for locally generated packets.
But this is true only when the default qdisc is set to the vrf device. When
changing the qdisc, notrack is not set anymore.
In fact, there is a shortcut in the vrf driver, when the default qdisc is
set, see commit dcdd43c41e60 ("net: vrf: performance improvements for
IPv4") for more details.

This patch ensures that the behavior is always the same, whatever the qdisc
is.

To demonstrate the difference, a new test is added in conntrack_vrf.sh.

Fixes: 8c9c296adfae ("vrf: run conntrack only in context of lower/physdev for locally generated packets")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 drivers/net/vrf.c                                  |  8 +++---
 tools/testing/selftests/netfilter/conntrack_vrf.sh | 30 +++++++++++++++++++---
 2 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index ccf677015d5b..38c2f0dbe795 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -768,8 +768,6 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
 
 	skb->dev = vrf_dev;
 
-	vrf_nf_set_untracked(skb);
-
 	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
 		      skb, NULL, vrf_dev, vrf_ip6_out_direct_finish);
 
@@ -790,6 +788,8 @@ static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
 	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
 		return skb;
 
+	vrf_nf_set_untracked(skb);
+
 	if (qdisc_tx_is_default(vrf_dev) ||
 	    IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
 		return vrf_ip6_out_direct(vrf_dev, sk, skb);
@@ -998,8 +998,6 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
 
 	skb->dev = vrf_dev;
 
-	vrf_nf_set_untracked(skb);
-
 	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
 		      skb, NULL, vrf_dev, vrf_ip_out_direct_finish);
 
@@ -1021,6 +1019,8 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
 	    ipv4_is_lbcast(ip_hdr(skb)->daddr))
 		return skb;
 
+	vrf_nf_set_untracked(skb);
+
 	if (qdisc_tx_is_default(vrf_dev) ||
 	    IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
 		return vrf_ip_out_direct(vrf_dev, sk, skb);
diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/netfilter/conntrack_vrf.sh
index 91f3ef0f1192..8b5ea9234588 100755
--- a/tools/testing/selftests/netfilter/conntrack_vrf.sh
+++ b/tools/testing/selftests/netfilter/conntrack_vrf.sh
@@ -150,11 +150,27 @@ EOF
 # oifname is the vrf device.
 test_masquerade_vrf()
 {
+	local qdisc=$1
+
+	if [ "$qdisc" != "default" ]; then
+		tc -net $ns0 qdisc add dev tvrf root $qdisc
+	fi
+
 	ip netns exec $ns0 conntrack -F 2>/dev/null
 
 ip netns exec $ns0 nft -f - <<EOF
 flush ruleset
 table ip nat {
+	chain rawout {
+		type filter hook output priority raw;
+
+		oif tvrf ct state untracked counter
+	}
+	chain postrouting2 {
+		type filter hook postrouting priority mangle;
+
+		oif tvrf ct state untracked counter
+	}
 	chain postrouting {
 		type nat hook postrouting priority 0;
 		# NB: masquerade should always be combined with 'oif(name) bla',
@@ -171,13 +187,18 @@ EOF
 	fi
 
 	# must also check that nat table was evaluated on second (lower device) iteration.
-	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
+	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' &&
+	ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]'
 	if [ $? -eq 0 ]; then
-		echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device"
+		echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
 	else
-		echo "FAIL: vrf masq rule has unexpected counter value"
+		echo "FAIL: vrf rules have unexpected counter value"
 		ret=1
 	fi
+
+	if [ "$qdisc" != "default" ]; then
+		tc -net $ns0 qdisc del dev tvrf root
+	fi
 }
 
 # add masq rule that gets evaluated w. outif set to veth device.
@@ -213,7 +234,8 @@ EOF
 }
 
 test_ct_zone_in
-test_masquerade_vrf
+test_masquerade_vrf "default"
+test_masquerade_vrf "pfifo"
 test_masquerade_veth
 
 exit $ret
-- 
cgit v1.2.3


From 0de53b0ffb5b22b52c1e0bd4d9e18cbbce5801d0 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sat, 27 Nov 2021 11:33:38 +0100
Subject: selftests: netfilter: Add correctness test for mac,net set type

The existing net,mac test didn't cover the issue recently reported
by Nikita Yushchenko, where MAC addresses wouldn't match if given
as first field of a concatenated set with AVX2 and 8-bit groups,
because there's a different code path covering the lookup of six
8-bit groups (MAC addresses) if that's the first field.

Add a similar mac,net test, with MAC address and IPv4 address
swapped in the set specification.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../selftests/netfilter/nft_concat_range.sh        | 24 +++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
index 5a4938d6dcf2..ed61f6cab60f 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -23,8 +23,8 @@ TESTS="reported_issues correctness concurrency timeout"
 
 # Set types, defined by TYPE_ variables below
 TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto
-       net_port_net net_mac net_mac_icmp net6_mac_icmp net6_port_net6_port
-       net_port_mac_proto_net"
+       net_port_net net_mac mac_net net_mac_icmp net6_mac_icmp
+       net6_port_net6_port net_port_mac_proto_net"
 
 # Reported bugs, also described by TYPE_ variables below
 BUGS="flush_remove_add"
@@ -277,6 +277,23 @@ perf_entries	1000
 perf_proto	ipv4
 "
 
+TYPE_mac_net="
+display		mac,net
+type_spec	ether_addr . ipv4_addr
+chain_spec	ether saddr . ip saddr
+dst		 
+src		mac addr4
+start		1
+count		5
+src_delta	2000
+tools		sendip nc bash
+proto		udp
+
+race_repeat	0
+
+perf_duration	0
+"
+
 TYPE_net_mac_icmp="
 display		net,mac - ICMP
 type_spec	ipv4_addr . ether_addr
@@ -984,7 +1001,8 @@ format() {
 		fi
 	done
 	for f in ${src}; do
-		__expr="${__expr} . "
+		[ "${__expr}" != "{ " ] && __expr="${__expr} . "
+
 		__start="$(eval format_"${f}" "${srcstart}")"
 		__end="$(eval format_"${f}" "${srcend}")"
 
-- 
cgit v1.2.3


From d46cea0e6933da93c5373a46e3dc7e5d0e56bedb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 3 Dec 2021 15:33:23 +0100
Subject: selftests: netfilter: switch zone stress to socat

centos9 has nmap-ncat which doesn't like the '-q' option, use socat.
While at it, mark test skipped if needed tools are missing.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_zones_many.sh | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/netfilter/nft_zones_many.sh
index ac646376eb01..04633119b29a 100755
--- a/tools/testing/selftests/netfilter/nft_zones_many.sh
+++ b/tools/testing/selftests/netfilter/nft_zones_many.sh
@@ -18,11 +18,17 @@ cleanup()
 	ip netns del $ns
 }
 
-ip netns add $ns
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $gw"
-	exit $ksft_skip
-fi
+checktool (){
+	if ! $1 > /dev/null 2>&1; then
+		echo "SKIP: Could not $2"
+		exit $ksft_skip
+	fi
+}
+
+checktool "nft --version" "run test without nft tool"
+checktool "ip -Version" "run test without ip tool"
+checktool "socat -V" "run test without socat tool"
+checktool "ip netns add $ns" "create net namespace"
 
 trap cleanup EXIT
 
@@ -71,7 +77,8 @@ EOF
 		local start=$(date +%s%3N)
 		i=$((i + 10000))
 		j=$((j + 1))
-		dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" nc -w 1 -q 1 -u -p 12345 127.0.0.1 12345 > /dev/null
+		# nft rule in output places each packet in a different zone.
+		dd if=/dev/zero of=/dev/stdout bs=8k count=10000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345
 		if [ $? -ne 0 ] ;then
 			ret=1
 			break
-- 
cgit v1.2.3


From 12e885433dbc2e8098cfb62c68f731b18981f3f6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 12 Nov 2021 08:16:42 -0800
Subject: rcutorture: Test RCU-tasks multiqueue callback queueing

This commit modifies the TASKS01 scenario to use four callback queues
and the TRACE01 scenario to use two queues, thus providing testing of
multiple queues by default.

Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot | 1 +
 tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot | 1 +
 2 files changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
index 22cdeced98ea..30ca5b493c4b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
@@ -1,2 +1,3 @@
 rcutorture.torture_type=tasks
 rcutree.use_softirq=0
+rcupdate.rcu_task_enqueue_lim=4
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
index 9675ad632dcc..ba6d636a4856 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
@@ -1 +1,2 @@
 rcutorture.torture_type=tasks-tracing
+rcupdate.rcu_task_enqueue_lim=2
-- 
cgit v1.2.3


From 53b541fbdb9c498db112216a0cc9fe4804f54742 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 23 Nov 2021 13:51:11 -0800
Subject: rcutorture: Combine n_max_cbs from all kthreads in a callback flood

With the addition of multiple callback-flood kthreads, the maximum number
of callbacks from any one of those kthreads is reported in the rcutorture
run summary.  This commit changes this to report the sum of each kthread's
maximum number of callbacks in a given callback-flooding episode.

Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c                                   | 9 +++++++++
 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1e8360d15920..33ea446101b3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2170,6 +2170,7 @@ struct rcu_fwd {
 static DEFINE_MUTEX(rcu_fwd_mutex);
 static struct rcu_fwd *rcu_fwds;
 static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
 static bool rcu_fwd_emergency_stop;
 
 static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -2428,6 +2429,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 			 n_launders + n_max_cbs - n_launders_cb_snap,
 			 n_launders, n_launders_sa,
 			 n_max_gps, n_max_cbs, cver, gps);
+		atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
 		rcu_torture_fwd_cb_hist(rfp);
 	}
 	schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
@@ -2489,6 +2491,8 @@ static struct notifier_block rcutorture_oom_nb = {
 /* Carry out grace-period forward-progress testing. */
 static int rcu_torture_fwd_prog(void *args)
 {
+	bool firsttime = true;
+	long max_cbs;
 	int oldnice = task_nice(current);
 	unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
 	struct rcu_fwd *rfp = args;
@@ -2503,6 +2507,11 @@ static int rcu_torture_fwd_prog(void *args)
 		if (!rfp->rcu_fwd_id) {
 			schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
 			WRITE_ONCE(rcu_fwd_emergency_stop, false);
+			if (!firsttime) {
+				max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+				pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+			}
+			firsttime = false;
 			WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
 		} else {
 			while (READ_ONCE(rcu_fwd_seq) == oldseq)
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index fbdf162b6acd..1c4c2c727dad 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -25,7 +25,7 @@ stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null |
 	    tail -1 | sed -e 's/^\[[ 0-9.]*] //' |
 	    awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' |
 	    tr -d '\012\015'`"
-fwdprog="`grep 'rcu_torture_fwd_prog_cr Duration' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k15nr | head -1 | awk '{ print $14 " " $15 }'`"
+fwdprog="`grep 'rcu_torture_fwd_prog n_max_cbs: ' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k3nr | head -1 | awk '{ print $2 " " $3 }'`"
 if test -z "$ngps"
 then
 	echo "$configfile ------- " $stopstate
-- 
cgit v1.2.3


From 5504f67944484495a5d8504d11fb998af05fe248 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 15 Nov 2021 12:28:23 +0100
Subject: perf test sigtrap: Add basic stress test for sigtrap handling

Add basic stress test for sigtrap handling as a perf tool built-in test.
This allows sanity checking the basic sigtrap functionality from within
the perf tool.

Committer notes:

Reported that !root was getting -EPERM, applied a fixup from Marco to
set .exclude_{hv,kernel} that made it work.

Signed-off-by: Marco Elver <elver@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fabian Hemmer <copy@copy.sh>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kasan-dev@googlegroups.com
Link: http://lore.kernel.org/lkml/20211115112822.4077224-1-elver@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/Build          |   1 +
 tools/perf/tests/builtin-test.c |   1 +
 tools/perf/tests/sigtrap.c      | 156 ++++++++++++++++++++++++++++++++++++++++
 tools/perf/tests/tests.h        |   1 +
 4 files changed, 159 insertions(+)
 create mode 100644 tools/perf/tests/sigtrap.c

(limited to 'tools')

diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 803ca426f8e6..af2b37ef7c70 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -65,6 +65,7 @@ perf-y += pe-file-parsing.o
 perf-y += expand-cgroup.o
 perf-y += perf-time-to-tsc.o
 perf-y += dlfilter-test.o
+perf-y += sigtrap.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
 	$(call rule_mkdir)
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 8cb5a1c3489e..f1e6d2a3a578 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -107,6 +107,7 @@ static struct test_suite *generic_tests[] = {
 	&suite__expand_cgroup_events,
 	&suite__perf_time_to_tsc,
 	&suite__dlfilter,
+	&suite__sigtrap,
 	NULL,
 };
 
diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
new file mode 100644
index 000000000000..de409f21f952
--- /dev/null
+++ b/tools/perf/tests/sigtrap.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic test for sigtrap support.
+ *
+ * Copyright (C) 2021, Google LLC.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <linux/hw_breakpoint.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "cloexec.h"
+#include "debug.h"
+#include "event.h"
+#include "tests.h"
+#include "../perf-sys.h"
+
+#define NUM_THREADS 5
+
+static struct {
+	int tids_want_signal;		/* Which threads still want a signal. */
+	int signal_count;		/* Sanity check number of signals received. */
+	volatile int iterate_on;	/* Variable to set breakpoint on. */
+	siginfo_t first_siginfo;	/* First observed siginfo_t. */
+} ctx;
+
+#define TEST_SIG_DATA (~(unsigned long)(&ctx.iterate_on))
+
+static struct perf_event_attr make_event_attr(void)
+{
+	struct perf_event_attr attr = {
+		.type		= PERF_TYPE_BREAKPOINT,
+		.size		= sizeof(attr),
+		.sample_period	= 1,
+		.disabled	= 1,
+		.bp_addr	= (unsigned long)&ctx.iterate_on,
+		.bp_type	= HW_BREAKPOINT_RW,
+		.bp_len		= HW_BREAKPOINT_LEN_1,
+		.inherit	= 1, /* Children inherit events ... */
+		.inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. */
+		.remove_on_exec = 1, /* Required by sigtrap. */
+		.sigtrap	= 1, /* Request synchronous SIGTRAP on event. */
+		.sig_data	= TEST_SIG_DATA,
+		.exclude_kernel = 1, /* To allow */
+		.exclude_hv     = 1, /* running as !root */
+	};
+	return attr;
+}
+
+static void
+sigtrap_handler(int signum __maybe_unused, siginfo_t *info, void *ucontext __maybe_unused)
+{
+	if (!__atomic_fetch_add(&ctx.signal_count, 1, __ATOMIC_RELAXED))
+		ctx.first_siginfo = *info;
+	__atomic_fetch_sub(&ctx.tids_want_signal, syscall(SYS_gettid), __ATOMIC_RELAXED);
+}
+
+static void *test_thread(void *arg)
+{
+	pthread_barrier_t *barrier = (pthread_barrier_t *)arg;
+	pid_t tid = syscall(SYS_gettid);
+	int i;
+
+	pthread_barrier_wait(barrier);
+
+	__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
+	for (i = 0; i < ctx.iterate_on - 1; i++)
+		__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
+
+	return NULL;
+}
+
+static int run_test_threads(pthread_t *threads, pthread_barrier_t *barrier)
+{
+	int i;
+
+	pthread_barrier_wait(barrier);
+	for (i = 0; i < NUM_THREADS; i++)
+		TEST_ASSERT_EQUAL("pthread_join() failed", pthread_join(threads[i], NULL), 0);
+
+	return TEST_OK;
+}
+
+static int run_stress_test(int fd, pthread_t *threads, pthread_barrier_t *barrier)
+{
+	int ret;
+
+	ctx.iterate_on = 3000;
+
+	TEST_ASSERT_EQUAL("misfired signal?", ctx.signal_count, 0);
+	TEST_ASSERT_EQUAL("enable failed", ioctl(fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+	ret = run_test_threads(threads, barrier);
+	TEST_ASSERT_EQUAL("disable failed", ioctl(fd, PERF_EVENT_IOC_DISABLE, 0), 0);
+
+	TEST_ASSERT_EQUAL("unexpected sigtraps", ctx.signal_count, NUM_THREADS * ctx.iterate_on);
+	TEST_ASSERT_EQUAL("missing signals or incorrectly delivered", ctx.tids_want_signal, 0);
+	TEST_ASSERT_VAL("unexpected si_addr", ctx.first_siginfo.si_addr == &ctx.iterate_on);
+#if 0 /* FIXME: enable when libc's signal.h has si_perf_{type,data} */
+	TEST_ASSERT_EQUAL("unexpected si_perf_type", ctx.first_siginfo.si_perf_type,
+			  PERF_TYPE_BREAKPOINT);
+	TEST_ASSERT_EQUAL("unexpected si_perf_data", ctx.first_siginfo.si_perf_data,
+			  TEST_SIG_DATA);
+#endif
+
+	return ret;
+}
+
+static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
+{
+	struct perf_event_attr attr = make_event_attr();
+	struct sigaction action = {};
+	struct sigaction oldact;
+	pthread_t threads[NUM_THREADS];
+	pthread_barrier_t barrier;
+	int i, fd, ret = TEST_FAIL;
+
+	pthread_barrier_init(&barrier, NULL, NUM_THREADS + 1);
+
+	action.sa_flags = SA_SIGINFO | SA_NODEFER;
+	action.sa_sigaction = sigtrap_handler;
+	sigemptyset(&action.sa_mask);
+	if (sigaction(SIGTRAP, &action, &oldact)) {
+		pr_debug("FAILED sigaction()\n");
+		goto out;
+	}
+
+	fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag());
+	if (fd < 0) {
+		pr_debug("FAILED sys_perf_event_open()\n");
+		goto out_restore_sigaction;
+	}
+
+	for (i = 0; i < NUM_THREADS; i++) {
+		if (pthread_create(&threads[i], NULL, test_thread, &barrier)) {
+			pr_debug("FAILED pthread_create()");
+			goto out_close_perf_event;
+		}
+	}
+
+	ret = run_stress_test(fd, threads, &barrier);
+
+out_close_perf_event:
+	close(fd);
+out_restore_sigaction:
+	sigaction(SIGTRAP, &oldact, NULL);
+out:
+	pthread_barrier_destroy(&barrier);
+	return ret;
+}
+
+DEFINE_SUITE("Sigtrap", sigtrap);
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 8f65098110fc..5bbb8f6a48fc 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -146,6 +146,7 @@ DECLARE_SUITE(pe_file_parsing);
 DECLARE_SUITE(expand_cgroup_events);
 DECLARE_SUITE(perf_time_to_tsc);
 DECLARE_SUITE(dlfilter);
+DECLARE_SUITE(sigtrap);
 
 /*
  * PowerPC and S390 do not support creation of instruction breakpoints using the
-- 
cgit v1.2.3


From e9c08f722924c58041d2e0d90ea27140a4625776 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 16 Nov 2021 09:51:48 -0300
Subject: perf test sigtrap: Print errno string when failing

Helps a bit the user figuring out why it is failing:

Before:

  $ perf test sigtrap
  73: Sigtrap                                                         : FAILED!
  $ perf test -v sigtrap
  73: Sigtrap                                                         :
  --- start ---
  test child forked, pid 3816772
  FAILED sys_perf_event_open()
  test child finished with -1
  ---- end ----
  Sigtrap: FAILED!
  $

After:

  $ perf test sigtrap
  73: Sigtrap                                                         : FAILED!
  $ perf test -v sigtrap
  73: Sigtrap                                                         :
  --- start ---
  test child forked, pid 3816772
  FAILED sys_perf_event_open(): Permission denied
  test child finished with -1
  ---- end ----
  Sigtrap: FAILED!
  $

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fabian Hemmer <copy@copy.sh>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kasan-dev@googlegroups.com
Link: http://lore.kernel.org/lkml/YZOpSVOCXe0zWeRs@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/sigtrap.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
index de409f21f952..1004bf0e7cc9 100644
--- a/tools/perf/tests/sigtrap.c
+++ b/tools/perf/tests/sigtrap.c
@@ -5,9 +5,11 @@
  * Copyright (C) 2021, Google LLC.
  */
 
+#include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/string.h>
 #include <pthread.h>
 #include <signal.h>
 #include <sys/ioctl.h>
@@ -117,6 +119,7 @@ static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __m
 	struct sigaction oldact;
 	pthread_t threads[NUM_THREADS];
 	pthread_barrier_t barrier;
+	char sbuf[STRERR_BUFSIZE];
 	int i, fd, ret = TEST_FAIL;
 
 	pthread_barrier_init(&barrier, NULL, NUM_THREADS + 1);
@@ -125,19 +128,19 @@ static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __m
 	action.sa_sigaction = sigtrap_handler;
 	sigemptyset(&action.sa_mask);
 	if (sigaction(SIGTRAP, &action, &oldact)) {
-		pr_debug("FAILED sigaction()\n");
+		pr_debug("FAILED sigaction(): %s\n", str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out;
 	}
 
 	fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag());
 	if (fd < 0) {
-		pr_debug("FAILED sys_perf_event_open()\n");
+		pr_debug("FAILED sys_perf_event_open(): %s\n", str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_restore_sigaction;
 	}
 
 	for (i = 0; i < NUM_THREADS; i++) {
 		if (pthread_create(&threads[i], NULL, test_thread, &barrier)) {
-			pr_debug("FAILED pthread_create()");
+			pr_debug("FAILED pthread_create(): %s\n", str_error_r(errno, sbuf, sizeof(sbuf)));
 			goto out_close_perf_event;
 		}
 	}
-- 
cgit v1.2.3


From c77a78c29177f9a614915e5158a7b6bb89e0e8db Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Sat, 30 Oct 2021 00:30:41 +0800
Subject: tools build: Enable warnings through HOSTCFLAGS

The tools build system uses KBUILD_HOSTCFLAGS symbol for obvious purposes.

However this is not set for anything under tools/

As such, host tools apps built have no compiler warnings enabled.

Declare HOSTCFLAGS for perf tools build, and also use that symbol in
declaration of host_c_flags. HOSTCFLAGS comes from EXTRA_WARNINGS, which
is independent of target platform/arch warning flags.

Suggested-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/1635525041-151876-1-git-send-email-john.garry@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Build.include  | 2 +-
 tools/perf/Makefile.config | 5 +++++
 tools/perf/Makefile.perf   | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/build/Build.include b/tools/build/Build.include
index 2cf3b1bde86e..c2a95ab47379 100644
--- a/tools/build/Build.include
+++ b/tools/build/Build.include
@@ -99,7 +99,7 @@ cxx_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(CXXFLAGS) -D"BUILD_STR(s)=\#s" $(CXX
 ###
 ## HOSTCC C flags
 
-host_c_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(KBUILD_HOSTCFLAGS) -D"BUILD_STR(s)=\#s" $(HOSTCFLAGS_$(basetarget).o) $(HOSTCFLAGS_$(obj))
+host_c_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(HOSTCFLAGS) -D"BUILD_STR(s)=\#s" $(HOSTCFLAGS_$(basetarget).o) $(HOSTCFLAGS_$(obj))
 
 # output directory for tests below
 TMPOUT = .tmp_$$$$
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 3df74cf5651a..94bb53b0cebd 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -17,6 +17,7 @@ detected     = $(shell echo "$(1)=y"       >> $(OUTPUT).config-detected)
 detected_var = $(shell echo "$(1)=$($(1))" >> $(OUTPUT).config-detected)
 
 CFLAGS := $(EXTRA_CFLAGS) $(filter-out -Wnested-externs,$(EXTRA_WARNINGS))
+HOSTCFLAGS := $(filter-out -Wnested-externs,$(EXTRA_WARNINGS))
 
 include $(srctree)/tools/scripts/Makefile.arch
 
@@ -211,6 +212,7 @@ endif
 ifneq ($(WERROR),0)
   CORE_CFLAGS += -Werror
   CXXFLAGS += -Werror
+  HOSTCFLAGS += -Werror
 endif
 
 ifndef DEBUG
@@ -290,6 +292,9 @@ CXXFLAGS += -ggdb3
 CXXFLAGS += -funwind-tables
 CXXFLAGS += -Wno-strict-aliasing
 
+HOSTCFLAGS += -Wall
+HOSTCFLAGS += -Wextra
+
 # Enforce a non-executable stack, as we may regress (again) in the future by
 # adding assembler files missing the .GNU-stack linker note.
 LDFLAGS += -Wl,-z,noexecstack
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 80522bcfafe0..164a37523781 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -226,7 +226,7 @@ else
 endif
 
 export srctree OUTPUT RM CC CXX LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
-export HOSTCC HOSTLD HOSTAR
+export HOSTCC HOSTLD HOSTAR HOSTCFLAGS
 
 include $(srctree)/tools/build/Makefile.include
 
-- 
cgit v1.2.3


From 9a5b2d1afa9f888335ab63e922ba5eed31383020 Mon Sep 17 00:00:00 2001
From: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Date: Tue, 9 Nov 2021 17:58:29 +0900
Subject: libperf: Adopt perf_counts_values__scale() from tools/perf/util

Move perf_counts_values__scale() from tools/perf/util to tools/lib/perf
so that it can be used with libperf.

Committer notes:

As noted by Jiri, use __s8 instead of s8 on the exported function.

Signed-off-by: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211109085831.3770594-2-nakamura.shun@fujitsu.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evsel.c              | 19 +++++++++++++++++++
 tools/lib/perf/include/perf/evsel.h |  4 ++++
 tools/lib/perf/libperf.map          |  1 +
 tools/perf/util/evsel.c             | 19 -------------------
 tools/perf/util/evsel.h             |  3 ---
 5 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 8441e3e1aaac..68f83d2c27c1 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -431,3 +431,22 @@ void perf_evsel__free_id(struct perf_evsel *evsel)
 	zfree(&evsel->id);
 	evsel->ids = 0;
 }
+
+void perf_counts_values__scale(struct perf_counts_values *count,
+			       bool scale, __s8 *pscaled)
+{
+	s8 scaled = 0;
+
+	if (scale) {
+		if (count->run == 0) {
+			scaled = -1;
+			count->val = 0;
+		} else if (count->run < count->ena) {
+			scaled = 1;
+			count->val = (u64)((double)count->val * count->ena / count->run);
+		}
+	}
+
+	if (pscaled)
+		*pscaled = scaled;
+}
diff --git a/tools/lib/perf/include/perf/evsel.h b/tools/lib/perf/include/perf/evsel.h
index 60eae25076d3..f401c7484bec 100644
--- a/tools/lib/perf/include/perf/evsel.h
+++ b/tools/lib/perf/include/perf/evsel.h
@@ -4,6 +4,8 @@
 
 #include <stdint.h>
 #include <perf/core.h>
+#include <stdbool.h>
+#include <linux/types.h>
 
 struct perf_evsel;
 struct perf_event_attr;
@@ -39,5 +41,7 @@ LIBPERF_API int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu);
 LIBPERF_API struct perf_cpu_map *perf_evsel__cpus(struct perf_evsel *evsel);
 LIBPERF_API struct perf_thread_map *perf_evsel__threads(struct perf_evsel *evsel);
 LIBPERF_API struct perf_event_attr *perf_evsel__attr(struct perf_evsel *evsel);
+LIBPERF_API void perf_counts_values__scale(struct perf_counts_values *count,
+					   bool scale, __s8 *pscaled);
 
 #endif /* __LIBPERF_EVSEL_H */
diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map
index 71468606e8a7..5979bf92d98f 100644
--- a/tools/lib/perf/libperf.map
+++ b/tools/lib/perf/libperf.map
@@ -50,6 +50,7 @@ LIBPERF_0.0.1 {
 		perf_mmap__read_init;
 		perf_mmap__read_done;
 		perf_mmap__read_event;
+		perf_counts_values__scale;
 	local:
 		*;
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ac0127be0459..656c30b988ce 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1476,25 +1476,6 @@ void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
 	count->run = count->run - tmp.run;
 }
 
-void perf_counts_values__scale(struct perf_counts_values *count,
-			       bool scale, s8 *pscaled)
-{
-	s8 scaled = 0;
-
-	if (scale) {
-		if (count->run == 0) {
-			scaled = -1;
-			count->val = 0;
-		} else if (count->run < count->ena) {
-			scaled = 1;
-			count->val = (u64)((double) count->val * count->ena / count->run);
-		}
-	}
-
-	if (pscaled)
-		*pscaled = scaled;
-}
-
 static int evsel__read_one(struct evsel *evsel, int cpu, int thread)
 {
 	struct perf_counts_values *count = perf_counts(evsel->counts, cpu, thread);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 29d49a8c1e92..99aa3363def7 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -195,9 +195,6 @@ static inline int evsel__nr_cpus(struct evsel *evsel)
 	return evsel__cpus(evsel)->nr;
 }
 
-void perf_counts_values__scale(struct perf_counts_values *count,
-			       bool scale, s8 *pscaled);
-
 void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
 			   struct perf_counts_values *count);
 
-- 
cgit v1.2.3


From f2c4dcf191904d28d710290eea4a623710eee57c Mon Sep 17 00:00:00 2001
From: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Date: Tue, 9 Nov 2021 17:58:30 +0900
Subject: libperf: Remove scaling process from perf_mmap__read_self()

Remove the scaling process from perf_mmap__read_self(), and unify the
counters that can be obtained from perf_evsel__read() to "no scaling".

Signed-off-by: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211109085831.3770594-3-nakamura.shun@fujitsu.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/mmap.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
index c89dfa5f67b3..aaa457904008 100644
--- a/tools/lib/perf/mmap.c
+++ b/tools/lib/perf/mmap.c
@@ -353,8 +353,6 @@ int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count
 		count->ena += delta;
 		if (idx)
 			count->run += delta;
-
-		cnt = mul_u64_u64_div64(cnt, count->ena, count->run);
 	}
 
 	count->val = cnt;
-- 
cgit v1.2.3


From a7f3713f6bf207e6d8dd484704dba6089f7ad8db Mon Sep 17 00:00:00 2001
From: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Date: Tue, 9 Nov 2021 17:58:31 +0900
Subject: libperf tests: Add test_stat_multiplexing test

Adds a test for a counter obtained using read() system call during
multiplexing.

  $ sudo make tests -C ./tools/lib/perf/ V=1
  make: Entering directory '/home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/lib/perf'
  make -f /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/build/Makefile.build dir=. obj=libperf
  make -C /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/lib/api/ O= libapi.a
  make -f /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/build/Makefile.build dir=./fd obj=libapi
  make -f /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/build/Makefile.build dir=./fs obj=libapi
  make -f /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/build/Makefile.build dir=. obj=tests
  make -f /home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/build/Makefile.build dir=./tests obj=tests
  running static:
  - running tests/test-cpumap.c...OK
  - running tests/test-threadmap.c...OK
  - running tests/test-evlist.c...
  Event  0 -- Raw count = 298049842, run = 270269503, enable = 456262127
           Scaled count = 503160191 (59.24%, 270269503/456262127)
  Event  1 -- Raw count = 299134173, run = 271075173, enable = 456257234
           Scaled count = 503484435 (59.41%, 271075173/456257234)
  Event  2 -- Raw count = 300461996, run = 272069283, enable = 456253417
           Scaled count = 503867290 (59.63%, 272069283/456253417)
  Event  3 -- Raw count = 301308704, run = 273063387, enable = 456249352
           Scaled count = 503443183 (59.85%, 273063387/456249352)
  Event  4 -- Raw count = 302531164, run = 274102932, enable = 456244712
           Scaled count = 503563543 (60.08%, 274102932/456244712)
  Event  5 -- Raw count = 303710254, run = 275406214, enable = 456228165
           Scaled count = 503115633 (60.37%, 275406214/456228165)
  Event  6 -- Raw count = 304531302, run = 276396076, enable = 456221130
           Scaled count = 502661313 (60.58%, 276396076/456221130)
  Event  7 -- Raw count = 304486460, run = 276601890, enable = 456213754
           Scaled count = 502205212 (60.63%, 276601890/456213754)
  Event  8 -- Raw count = 304116681, run = 276631326, enable = 456205562
           Scaled count = 501532936 (60.64%, 276631326/456205562)
  Event  9 -- Raw count = 303567766, run = 276188567, enable = 456196839
           Scaled count = 501420666 (60.54%, 276188567/456196839)
  Event 10 -- Raw count = 302238014, run = 275144001, enable = 456185300
           Scaled count = 501106833 (60.31%, 275144001/456185300)
  Event 11 -- Raw count = 300805716, run = 273824589, enable = 456175608
           Scaled count = 501124573 (60.03%, 273824589/456175608)
  Event 12 -- Raw count = 299959051, run = 272834556, enable = 456166593
           Scaled count = 501517477 (59.81%, 272834556/456166593)
  Event 13 -- Raw count = 299037090, run = 271820805, enable = 456157086
           Scaled count = 501830195 (59.59%, 271820805/456157086)
  Event 14 -- Raw count = 298327042, run = 270784311, enable = 456147546
           Scaled count = 502544433 (59.36%, 270784311/456147546)
     Expected: 501614268
     High: 503867290   Low:  298049842   Average:  502438527
     Average Error = 0.16%
  OK
  - running tests/test-evsel.c...
          loop = 65536, count = 328182
          loop = 131072, count = 660214
          loop = 262144, count = 1315534
          loop = 524288, count = 2635364
          loop = 1048576, count = 5271971
          loop = 65536, count = 491952
          loop = 131072, count = 850061
          loop = 262144, count = 1648608
          loop = 524288, count = 3162059
          loop = 1048576, count = 6353393
  OK
  running dynamic:
  - running tests/test-cpumap.c...OK
  - running tests/test-threadmap.c...OK
  - running tests/test-evlist.c...
  Event  0 -- Raw count = 300218292, run = 297528154, enable = 496789343
           Scaled count = 501281125 (59.89%, 297528154/496789343)
  Event  1 -- Raw count = 301438606, run = 298515328, enable = 496784768
           Scaled count = 501649643 (60.09%, 298515328/496784768)
  Event  2 -- Raw count = 302342618, run = 298798983, enable = 496782015
           Scaled count = 502673648 (60.15%, 298798983/496782015)
  Event  3 -- Raw count = 303132319, run = 299230407, enable = 496778508
           Scaled count = 503256412 (60.23%, 299230407/496778508)
  Event  4 -- Raw count = 302758195, run = 299218047, enable = 496774243
           Scaled count = 502651743 (60.23%, 299218047/496774243)
  Event  5 -- Raw count = 303158458, run = 299204274, enable = 496769146
           Scaled count = 503334281 (60.23%, 299204274/496769146)
  Event  6 -- Raw count = 303471397, run = 299197479, enable = 496763124
           Scaled count = 503859189 (60.23%, 299197479/496763124)
  Event  7 -- Raw count = 303583387, run = 299196861, enable = 496756458
           Scaled count = 504039405 (60.23%, 299196861/496756458)
  Event  8 -- Raw count = 303096897, run = 299186924, enable = 496748667
           Scaled count = 503240507 (60.23%, 299186924/496748667)
  Event  9 -- Raw count = 301424173, run = 297845086, enable = 496739994
           Scaled count = 502709122 (59.96%, 297845086/496739994)
  Event 10 -- Raw count = 300876415, run = 296851339, enable = 496729034
           Scaled count = 503464297 (59.76%, 296851339/496729034)
  Event 11 -- Raw count = 300239338, run = 296547963, enable = 496719538
           Scaled count = 502902612 (59.70%, 296547963/496719538)
  Event 12 -- Raw count = 299751948, run = 296547195, enable = 496710036
           Scaled count = 502077926 (59.70%, 296547195/496710036)
  Event 13 -- Raw count = 299341883, run = 296549981, enable = 496700423
           Scaled count = 501376663 (59.70%, 296549981/496700423)
  Event 14 -- Raw count = 299145476, run = 296561684, enable = 496690949
           Scaled count = 501018366 (59.71%, 296561684/496690949)
     Expected: 501669431
     High: 504039405   Low:  300218292   Average:  502635662
     Average Error = 0.19%
  OK
  - running tests/test-evsel.c...
          loop = 65536, count = 329275
          loop = 131072, count = 664638
          loop = 262144, count = 1315367
          loop = 524288, count = 2629617
          loop = 1048576, count = 5273657
          loop = 65536, count = 459641
          loop = 131072, count = 978402
          loop = 262144, count = 1581219
          loop = 524288, count = 3774908
          loop = 1048576, count = 7694417
  OK
  make: Leaving directory '/home/nakamura/build_work/build_kernel/linux_kernel/linux/tools/lib/perf'

Signed-off-by: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211109085831.3770594-4-nakamura.shun@fujitsu.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/tests/test-evlist.c | 157 +++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c
index ce91a582f0e4..520a78267743 100644
--- a/tools/lib/perf/tests/test-evlist.c
+++ b/tools/lib/perf/tests/test-evlist.c
@@ -21,6 +21,9 @@
 #include "tests.h"
 #include <internal/evsel.h>
 
+#define EVENT_NUM 15
+#define WAIT_COUNT 100000000UL
+
 static int libperf_print(enum libperf_print_level level,
 			 const char *fmt, va_list ap)
 {
@@ -413,6 +416,159 @@ static int test_mmap_cpus(void)
 	return 0;
 }
 
+static double display_error(long long average,
+			    long long high,
+			    long long low,
+			    long long expected)
+{
+	double error;
+
+	error = (((double)average - expected) / expected) * 100.0;
+
+	__T_VERBOSE("   Expected: %lld\n", expected);
+	__T_VERBOSE("   High: %lld   Low:  %lld   Average:  %lld\n",
+		    high, low, average);
+
+	__T_VERBOSE("   Average Error = %.2f%%\n", error);
+
+	return error;
+}
+
+static int test_stat_multiplexing(void)
+{
+	struct perf_counts_values expected_counts = { .val = 0 };
+	struct perf_counts_values counts[EVENT_NUM] = {{ .val = 0 },};
+	struct perf_thread_map *threads;
+	struct perf_evlist *evlist;
+	struct perf_evsel *evsel;
+	struct perf_event_attr attr = {
+		.type	     = PERF_TYPE_HARDWARE,
+		.config	     = PERF_COUNT_HW_INSTRUCTIONS,
+		.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+			       PERF_FORMAT_TOTAL_TIME_RUNNING,
+		.disabled    = 1,
+	};
+	int err, i, nonzero = 0;
+	unsigned long count;
+	long long max = 0, min = 0, avg = 0;
+	double error = 0.0;
+	s8 scaled = 0;
+
+	/* read for non-multiplexing event count */
+	threads = perf_thread_map__new_dummy();
+	__T("failed to create threads", threads);
+
+	perf_thread_map__set_pid(threads, 0, 0);
+
+	evsel = perf_evsel__new(&attr);
+	__T("failed to create evsel", evsel);
+
+	err = perf_evsel__open(evsel, NULL, threads);
+	__T("failed to open evsel", err == 0);
+
+	err = perf_evsel__enable(evsel);
+	__T("failed to enable evsel", err == 0);
+
+	/* wait loop */
+	count = WAIT_COUNT;
+	while (count--)
+		;
+
+	perf_evsel__read(evsel, 0, 0, &expected_counts);
+	__T("failed to read value for evsel", expected_counts.val != 0);
+	__T("failed to read non-multiplexing event count",
+	    expected_counts.ena == expected_counts.run);
+
+	err = perf_evsel__disable(evsel);
+	__T("failed to enable evsel", err == 0);
+
+	perf_evsel__close(evsel);
+	perf_evsel__delete(evsel);
+
+	perf_thread_map__put(threads);
+
+	/* read for multiplexing event count */
+	threads = perf_thread_map__new_dummy();
+	__T("failed to create threads", threads);
+
+	perf_thread_map__set_pid(threads, 0, 0);
+
+	evlist = perf_evlist__new();
+	__T("failed to create evlist", evlist);
+
+	for (i = 0; i < EVENT_NUM; i++) {
+		evsel = perf_evsel__new(&attr);
+		__T("failed to create evsel", evsel);
+
+		perf_evlist__add(evlist, evsel);
+	}
+	perf_evlist__set_maps(evlist, NULL, threads);
+
+	err = perf_evlist__open(evlist);
+	__T("failed to open evsel", err == 0);
+
+	perf_evlist__enable(evlist);
+
+	/* wait loop */
+	count = WAIT_COUNT;
+	while (count--)
+		;
+
+	i = 0;
+	perf_evlist__for_each_evsel(evlist, evsel) {
+		perf_evsel__read(evsel, 0, 0, &counts[i]);
+		__T("failed to read value for evsel", counts[i].val != 0);
+		i++;
+	}
+
+	perf_evlist__disable(evlist);
+
+	min = counts[0].val;
+	for (i = 0; i < EVENT_NUM; i++) {
+		__T_VERBOSE("Event %2d -- Raw count = %lu, run = %lu, enable = %lu\n",
+			    i, counts[i].val, counts[i].run, counts[i].ena);
+
+		perf_counts_values__scale(&counts[i], true, &scaled);
+		if (scaled == 1) {
+			__T_VERBOSE("\t Scaled count = %lu (%.2lf%%, %lu/%lu)\n",
+				    counts[i].val,
+				    (double)counts[i].run / (double)counts[i].ena * 100.0,
+				    counts[i].run, counts[i].ena);
+		} else if (scaled == -1) {
+			__T_VERBOSE("\t Not Runnnig\n");
+		} else {
+			__T_VERBOSE("\t Not Scaling\n");
+		}
+
+		if (counts[i].val > max)
+			max = counts[i].val;
+
+		if (counts[i].val < min)
+			min = counts[i].val;
+
+		avg += counts[i].val;
+
+		if (counts[i].val != 0)
+			nonzero++;
+	}
+
+	if (nonzero != 0)
+		avg = avg / nonzero;
+	else
+		avg = 0;
+
+	error = display_error(avg, max, min, expected_counts.val);
+
+	__T("Error out of range!", ((error <= 1.0) && (error >= -1.0)));
+
+	perf_evlist__close(evlist);
+	perf_evlist__delete(evlist);
+
+	perf_thread_map__put(threads);
+
+	return 0;
+}
+
 int test_evlist(int argc, char **argv)
 {
 	__T_START;
@@ -424,6 +580,7 @@ int test_evlist(int argc, char **argv)
 	test_stat_thread_enable();
 	test_mmap_thread();
 	test_mmap_cpus();
+	test_stat_multiplexing();
 
 	__T_END;
 	return tests_failed == 0 ? 0 : -1;
-- 
cgit v1.2.3


From 4edb117e6472ca0c0730887dba8b30cba0a3705e Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Tue, 23 Nov 2021 14:16:12 +0530
Subject: perf docs: Add info on AMD raw event encoding

AMD processors have events with event select codes and unit masks larger
than a byte. The core PMU, for example, uses 12-bit event select codes
split between bits 0-7 and 32-35 of the PERF_CTL MSRs as can be seen
from /sys/bus/event_sources/devices/cpu/format/*.

The Processor Programming Reference (PPR) lists the event codes as
unified 12-bit hexadecimal values instead and the split between the bits
is not apparent to someone who is not aware of the layout of the
PERF_CTL MSRs.

8-bit event select codes continue to work as the layout matches that of
the PERF_CTL MSRs i.e. bits 0-7 for event select and 8-15 for unit mask.

This adds more details in the perf man pages about using
/sys/bus/event_sources/devices/*/format/* for determining the correct
raw event encoding scheme.

E.g. the "op_cache_hit_miss.op_cache_hit" event with code 0x28f and
umask 0x03 can be programmed using its symbolic name as:

  $ sudo perf --debug perf-event-open stat -e op_cache_hit_miss.op_cache_hit sleep 1
  ------------------------------------------------------------
  perf_event_attr:
    type                             4
    size                             128
    config                           0x20000038f
    sample_type                      IDENTIFIER
    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
    disabled                         1
    inherit                          1
    enable_on_exec                   1
    exclude_guest                    1
  ------------------------------------------------------------
  [...]

One might use a simple eventsel+umask combination based on what the
current man pages say and incorrectly program the event as:

  $ sudo perf --debug perf-event-open stat -e r0328f sleep 1
  ------------------------------------------------------------
  perf_event_attr:
    type                             4
    size                             128
    config                           0x328f
    sample_type                      IDENTIFIER
    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
    disabled                         1
    inherit                          1
    enable_on_exec                   1
    exclude_guest                    1
  ------------------------------------------------------------
  [...]

When it should have been based on the format from sysfs:

  $ cat /sys/bus/event_source/devices/cpu/format/event
  config:0-7,32-35

  $ sudo perf --debug perf-event-open stat -e r20000038f sleep 1
  ------------------------------------------------------------
  perf_event_attr:
    type                             4
    size                             128
    config                           0x20000038f
    sample_type                      IDENTIFIER
    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
    disabled                         1
    inherit                          1
    enable_on_exec                   1
    exclude_guest                    1
  ------------------------------------------------------------
  [...]

Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Kim Phillips <kim.phillips@amd.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Robert Richter <rrichter@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Link: https://lore.kernel.org/r/20211123084613.243792-1-sandipan.das@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-list.txt   | 34 +++++++++++++++++++++++++++++++-
 tools/perf/Documentation/perf-record.txt |  6 ++++--
 tools/perf/Documentation/perf-stat.txt   |  6 ++++--
 tools/perf/Documentation/perf-top.txt    |  7 ++++---
 4 files changed, 45 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 4dc8d0af19df..a922a95289a9 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -94,7 +94,7 @@ RAW HARDWARE EVENT DESCRIPTOR
 Even when an event is not available in a symbolic form within perf right now,
 it can be encoded in a per processor specific way.
 
-For instance For x86 CPUs NNN represents the raw register encoding with the
+For instance on x86 CPUs, N is a hexadecimal value that represents the raw register encoding with the
 layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout
 of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344,
 Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
@@ -126,6 +126,38 @@ It's also possible to use pmu syntax:
  perf record -e cpu/r1a8/ ...
  perf record -e cpu/r0x1a8/ ...
 
+Some processors, like those from AMD, support event codes and unit masks
+larger than a byte. In such cases, the bits corresponding to the event
+configuration parameters can be seen with:
+
+  cat /sys/bus/event_source/devices/<pmu>/format/<config>
+
+Example:
+
+If the AMD docs for an EPYC 7713 processor describe an event as:
+
+  Event  Umask  Event Mask
+  Num.   Value  Mnemonic                        Description
+
+  28FH     03H  op_cache_hit_miss.op_cache_hit  Counts Op Cache micro-tag
+                                                hit events.
+
+raw encoding of 0x0328F cannot be used since the upper nibble of the
+EventSelect bits have to be specified via bits 32-35 as can be seen with:
+
+  cat /sys/bus/event_source/devices/cpu/format/event
+
+raw encoding of 0x20000038F should be used instead:
+
+ perf stat -e r20000038f -a sleep 1
+ perf record -e r20000038f ...
+
+It's also possible to use pmu syntax:
+
+ perf record -e r20000038f -a sleep 1
+ perf record -e cpu/r20000038f/ ...
+ perf record -e cpu/r0x20000038f/ ...
+
 You should refer to the processor specific documentation for getting these
 details. Some of them are referenced in the SEE ALSO section below.
 
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 3cf7bac67239..55df7b073a55 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -30,8 +30,10 @@ OPTIONS
 
         - a symbolic event name	(use 'perf list' to list all events)
 
-        - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
-	  hexadecimal event descriptor.
+        - a raw PMU event in the form of rN where N is a hexadecimal value
+          that represents the raw register encoding with the layout of the
+          event control registers as described by entries in
+          /sys/bus/event_sources/devices/cpu/format/*.
 
         - a symbolic or raw PMU event followed by an optional colon
 	  and a list of event modifiers, e.g., cpu-cycles:p.  See the
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 7e6fb7cbc0f4..604e6f2301ea 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -36,8 +36,10 @@ report::
 
 	- a symbolic event name (use 'perf list' to list all events)
 
-	- a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
-	  hexadecimal event descriptor.
+	- a raw PMU event in the form of rN where N is a hexadecimal value
+	  that represents the raw register encoding with the layout of the
+	  event control registers as described by entries in
+	  /sys/bus/event_sources/devices/cpu/format/*.
 
         - a symbolic or raw PMU event followed by an optional colon
 	  and a list of event modifiers, e.g., cpu-cycles:p.  See the
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 9898a32b8d9c..cac3dfbee7d8 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -38,9 +38,10 @@ Default is to monitor all CPUS.
 -e <event>::
 --event=<event>::
 	Select the PMU event. Selection can be a symbolic event name
-	(use 'perf list' to list all events) or a raw PMU
-	event (eventsel+umask) in the form of rNNN where NNN is a
-	hexadecimal event descriptor.
+	(use 'perf list' to list all events) or a raw PMU event in the form
+	of rN where N is a hexadecimal value that represents the raw register
+	encoding with the layout of the event control registers as described
+	by entries in /sys/bus/event_sources/devices/cpu/format/*.
 
 -E <entries>::
 --entries=<entries>::
-- 
cgit v1.2.3


From 7a2e14962cd43400c353cdc05550f580a284dcb9 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Tue, 23 Nov 2021 14:16:13 +0530
Subject: perf docs: Update link to AMD documentation

This updates the link to documentation on AMD processors.  The new link
points to a page where users can find the Processor Programming
Reference (PPR) documents for the family and model codes corresponding
to processors they are using.

Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kim Phillips <kim.phillips@amd.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Robert Richter <rrichter@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Link: https://lore.kernel.org/r/20211123084613.243792-2-sandipan.das@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-list.txt | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index a922a95289a9..57384a97c04f 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -81,7 +81,11 @@ On AMD systems it is implemented using IBS (up to precise-level 2).
 The precise modifier works with event types 0x76 (cpu-cycles, CPU
 clocks not halted) and 0xC1 (micro-ops retired). Both events map to
 IBS execution sampling (IBS op) with the IBS Op Counter Control bit
-(IbsOpCntCtl) set respectively (see AMD64 Architecture Programmer’s
+(IbsOpCntCtl) set respectively (see the
+Core Complex (CCX) -> Processor x86 Core -> Instruction Based Sampling (IBS)
+section of the [AMD Processor Programming Reference (PPR)] relevant to the
+family, model and stepping of the processor being used).
+
 Manual Volume 2: System Programming, 13.3 Instruction-Based
 Sampling). Examples to use IBS:
 
@@ -96,8 +100,10 @@ it can be encoded in a per processor specific way.
 
 For instance on x86 CPUs, N is a hexadecimal value that represents the raw register encoding with the
 layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout
-of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344,
-Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
+of IA32_PERFEVTSELx MSRs) or AMD's PERF_CTL MSRs (see the
+Core Complex (CCX) -> Processor x86 Core -> MSR Registers section of the
+[AMD Processor Programming Reference (PPR)] relevant to the family, model
+and stepping of the processor being used).
 
 Note: Only the following bit fields can be set in x86 counter
 registers: event, umask, edge, inv, cmask. Esp. guest/host only and
@@ -348,4 +354,4 @@ SEE ALSO
 linkperf:perf-stat[1], linkperf:perf-top[1],
 linkperf:perf-record[1],
 http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
-http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
+https://bugzilla.kernel.org/show_bug.cgi?id=206537[AMD Processor Programming Reference (PPR)]
-- 
cgit v1.2.3


From b4515ad6e1c8b195e3bd02a5a15b1c74119ea367 Mon Sep 17 00:00:00 2001
From: Gang Li <ligang.bdlg@bytedance.com>
Date: Tue, 23 Nov 2021 15:40:17 +0800
Subject: perf trace: Enable ignore_missing_thread for trace

perf already support ignore_missing_thread for -u/-p, but not yet
applied to `perf trace`. This patch enables ignore_missing_thread
for `perf trace`.

Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1481538943-21874-6-git-send-email-jolsa@kernel.org
Link: http://lkml.kernel.org/r/1513148513-6974-1-git-send-email-zhangmengting@huawei.com
Link: http://lore.kernel.org/lkml/20211123074018.11406-1-ligang.bdlg@bytedance.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0b52e08e558e..94d62a92f1a1 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3950,6 +3950,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 		evlist__add(evlist, pgfault_min);
 	}
 
+	/* Enable ignoring missing threads when -u/-p option is defined. */
+	trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid;
+
 	if (trace->sched &&
 	    evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime))
 		goto out_error_sched_stat_runtime;
-- 
cgit v1.2.3


From 6b6b16b3bb612757f7bc697496b9f5d6765512a6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 23 Nov 2021 17:52:26 -0800
Subject: perf metric: Reduce multiplexing with duration_time

It is common to use the same counters with and without duration_time.
The ID sharing code treats duration_time as if it were a hardware event
placed in the same group. This causes unnecessary multiplexing such as
in the following example where l3_cache_access isn't shared:

  $ perf stat -M l3 -a sleep 1

   Performance counter stats for 'system wide':

         3,117,007      l3_cache_miss         #    199.5 MB/s  l3_rd_bw
                                              #     43.6 %  l3_hits
                                              #     56.4 %  l3_miss                 (50.00%)
         5,526,447      l3_cache_access                                             (50.00%)
         5,392,435      l3_cache_access       # 5389191.2 access/s  l3_access_rate  (50.00%)
     1,000,601,901 ns   duration_time

       1.000601901 seconds time elapsed

Fix this by placing duration_time in all groups unless metric
sharing has been disabled on the command line:

  $ perf stat -M l3 -a sleep 1

   Performance counter stats for 'system wide':

         3,597,972      l3_cache_miss         #    230.3 MB/s  l3_rd_bw
                                              #     48.0 %  l3_hits
                                              #     52.0 %  l3_miss
         6,914,459      l3_cache_access       # 6909935.9 access/s  l3_access_rate
     1,000,654,579 ns   duration_time

       1.000654579 seconds time elapsed

  $ perf stat --metric-no-merge -M l3 -a sleep 1

   Performance counter stats for 'system wide':

         3,501,834      l3_cache_miss         #     53.5 %  l3_miss                (24.99%)
         6,548,173      l3_cache_access                                            (24.99%)
         3,417,622      l3_cache_miss         #     45.7 %  l3_hits                (25.04%)
         6,294,062      l3_cache_access                                            (25.04%)
         5,923,238      l3_cache_access       # 5919688.1 access/s  l3_access_rate (24.99%)
     1,000,599,683 ns   duration_time
         3,607,486      l3_cache_miss         #    230.9 MB/s  l3_rd_bw            (49.97%)

       1.000599683 seconds time elapsed

v2. Doesn't count duration_time in the metric_list_cmp function that
    sorts larger metrics first. Without this a metric with duration_time
    and an event is sorted the same as a metric with two events,
    possibly not allowing the first metric to share with the second.

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211124015226.3317994-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/metricgroup.c | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index fffe02aae3ed..51c99cb08abf 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -1115,13 +1115,27 @@ out:
 	return ret;
 }
 
+/**
+ * metric_list_cmp - list_sort comparator that sorts metrics with more events to
+ *                   the front. duration_time is excluded from the count.
+ */
 static int metric_list_cmp(void *priv __maybe_unused, const struct list_head *l,
 			   const struct list_head *r)
 {
 	const struct metric *left = container_of(l, struct metric, nd);
 	const struct metric *right = container_of(r, struct metric, nd);
+	struct expr_id_data *data;
+	int left_count, right_count;
+
+	left_count = hashmap__size(left->pctx->ids);
+	if (!expr__get_id(left->pctx, "duration_time", &data))
+		left_count--;
+
+	right_count = hashmap__size(right->pctx->ids);
+	if (!expr__get_id(right->pctx, "duration_time", &data))
+		right_count--;
 
-	return hashmap__size(right->pctx->ids) - hashmap__size(left->pctx->ids);
+	return right_count - left_count;
 }
 
 /**
@@ -1299,14 +1313,16 @@ err_out:
 /**
  * parse_ids - Build the event string for the ids and parse them creating an
  *             evlist. The encoded metric_ids are decoded.
+ * @metric_no_merge: is metric sharing explicitly disabled.
  * @fake_pmu: used when testing metrics not supported by the current CPU.
  * @ids: the event identifiers parsed from a metric.
  * @modifier: any modifiers added to the events.
  * @has_constraint: false if events should be placed in a weak group.
  * @out_evlist: the created list of events.
  */
-static int parse_ids(struct perf_pmu *fake_pmu, struct expr_parse_ctx *ids,
-		     const char *modifier, bool has_constraint, struct evlist **out_evlist)
+static int parse_ids(bool metric_no_merge, struct perf_pmu *fake_pmu,
+		     struct expr_parse_ctx *ids, const char *modifier,
+		     bool has_constraint, struct evlist **out_evlist)
 {
 	struct parse_events_error parse_error;
 	struct evlist *parsed_evlist;
@@ -1314,12 +1330,19 @@ static int parse_ids(struct perf_pmu *fake_pmu, struct expr_parse_ctx *ids,
 	int ret;
 
 	*out_evlist = NULL;
-	if (hashmap__size(ids->ids) == 0) {
+	if (!metric_no_merge || hashmap__size(ids->ids) == 0) {
 		char *tmp;
 		/*
-		 * No ids/events in the expression parsing context. Events may
-		 * have been removed because of constant evaluation, e.g.:
-		 *  event1 if #smt_on else 0
+		 * We may fail to share events between metrics because
+		 * duration_time isn't present in one metric. For example, a
+		 * ratio of cache misses doesn't need duration_time but the same
+		 * events may be used for a misses per second. Events without
+		 * sharing implies multiplexing, that is best avoided, so place
+		 * duration_time in every group.
+		 *
+		 * Also, there may be no ids/events in the expression parsing
+		 * context because of constant evaluation, e.g.:
+		 *    event1 if #smt_on else 0
 		 * Add a duration_time event to avoid a parse error on an empty
 		 * string.
 		 */
@@ -1387,7 +1410,8 @@ static int parse_groups(struct evlist *perf_evlist, const char *str,
 		ret = build_combined_expr_ctx(&metric_list, &combined);
 
 		if (!ret && combined && hashmap__size(combined->ids)) {
-			ret = parse_ids(fake_pmu, combined, /*modifier=*/NULL,
+			ret = parse_ids(metric_no_merge, fake_pmu, combined,
+					/*modifier=*/NULL,
 					/*has_constraint=*/true,
 					&combined_evlist);
 		}
@@ -1435,7 +1459,7 @@ static int parse_groups(struct evlist *perf_evlist, const char *str,
 			}
 		}
 		if (!metric_evlist) {
-			ret = parse_ids(fake_pmu, m->pctx, m->modifier,
+			ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
 					m->has_constraint, &m->evlist);
 			if (ret)
 				goto out;
-- 
cgit v1.2.3


From ecdcf630d71f3b4c64097cad0add561cd5010c02 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 30 Nov 2021 09:49:44 -0800
Subject: perf evlist: Allow setting arbitrary leader

The leader of a group is the first, but allow it to be an arbitrary list
member so that for Intel topdown events slots may always be the group
leader.

Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Link: http://lore.kernel.org/lkml/20211130174945.247604-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evlist.c                  | 15 +++++++++------
 tools/lib/perf/include/internal/evlist.h |  2 +-
 tools/perf/util/parse-events.c           |  4 ++--
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index e37dfad31383..245acbc53bd3 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -643,14 +643,14 @@ perf_evlist__next_mmap(struct perf_evlist *evlist, struct perf_mmap *map,
 	return overwrite ? evlist->mmap_ovw_first : evlist->mmap_first;
 }
 
-void __perf_evlist__set_leader(struct list_head *list)
+void __perf_evlist__set_leader(struct list_head *list, struct perf_evsel *leader)
 {
-	struct perf_evsel *evsel, *leader;
+	struct perf_evsel *first, *last, *evsel;
 
-	leader = list_entry(list->next, struct perf_evsel, node);
-	evsel = list_entry(list->prev, struct perf_evsel, node);
+	first = list_first_entry(list, struct perf_evsel, node);
+	last = list_last_entry(list, struct perf_evsel, node);
 
-	leader->nr_members = evsel->idx - leader->idx + 1;
+	leader->nr_members = last->idx - first->idx + 1;
 
 	__perf_evlist__for_each_entry(list, evsel)
 		evsel->leader = leader;
@@ -659,7 +659,10 @@ void __perf_evlist__set_leader(struct list_head *list)
 void perf_evlist__set_leader(struct perf_evlist *evlist)
 {
 	if (evlist->nr_entries) {
+		struct perf_evsel *first = list_entry(evlist->entries.next,
+						struct perf_evsel, node);
+
 		evlist->nr_groups = evlist->nr_entries > 1 ? 1 : 0;
-		__perf_evlist__set_leader(&evlist->entries);
+		__perf_evlist__set_leader(&evlist->entries, first);
 	}
 }
diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h
index f366dbad6a88..6f74269a3ad4 100644
--- a/tools/lib/perf/include/internal/evlist.h
+++ b/tools/lib/perf/include/internal/evlist.h
@@ -127,5 +127,5 @@ int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 
 void perf_evlist__reset_id_hash(struct perf_evlist *evlist);
 
-void __perf_evlist__set_leader(struct list_head *list);
+void __perf_evlist__set_leader(struct list_head *list, struct perf_evsel *leader);
 #endif /* __LIBPERF_INTERNAL_EVLIST_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index ba74fdf74af9..1d68167ab611 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1837,8 +1837,8 @@ void parse_events__set_leader(char *name, struct list_head *list,
 	if (parse_events__set_leader_for_uncore_aliase(name, list, parse_state))
 		return;
 
-	__perf_evlist__set_leader(list);
-	leader = list_entry(list->next, struct evsel, core.node);
+	leader = list_first_entry(list, struct evsel, core.node);
+	__perf_evlist__set_leader(list, &leader->core);
 	leader->group_name = name ? strdup(name) : NULL;
 }
 
-- 
cgit v1.2.3


From 94dbfd6781a0e87b6faa6012810eb22e7d5b8a70 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 30 Nov 2021 09:49:45 -0800
Subject: perf parse-events: Architecture specific leader override

Currently topdown events must appear after a slots event:

  $ perf stat -e '{slots,topdown-fe-bound}' /bin/true

   Performance counter stats for '/bin/true':

         3,183,090      slots
           986,133      topdown-fe-bound

Reversing the events yields:

  $ perf stat -e '{topdown-fe-bound,slots}' /bin/true
  Error:
  The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (topdown-fe-bound).

For metrics the order of events is determined by iterating over a
hashmap, and so slots isn't guaranteed to be first which can yield this
error.

Change the set_leader in parse-events, called when a group is closed, so
that rather than always making the first event the leader, if the slots
event exists then it is made the leader. It is then moved to the head of
the evlist otherwise it won't be opened in the correct order.

The result is:

  $ perf stat -e '{topdown-fe-bound,slots}' /bin/true

   Performance counter stats for '/bin/true':

         3,274,795      slots
         1,001,702      topdown-fe-bound

A problem with this approach is the slots event is identified by name,
names can be overwritten like 'cpu/slots,name=foo/' and this causes the
leader change to fail.

The change also modifies and fixes mixed groups like, with the change:

  $ perf stat -e '{instructions,slots,topdown-fe-bound}' -a -- sleep 2

   Performance counter stats for 'system wide':

        5574985410      slots
         971981616      instructions
        1348461887      topdown-fe-bound

       2.001263120 seconds time elapsed

Without the change:

  $ perf stat -e '{instructions,slots,topdown-fe-bound}' -a -- sleep 2

   Performance counter stats for 'system wide':

     <not counted>      instructions
     <not counted>      slots
   <not supported>      topdown-fe-bound

       2.006247990 seconds time elapsed

Something that may be undesirable here is that the events are reordered
in the output.

Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Link: http://lore.kernel.org/lkml/20211130174945.247604-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/x86/util/evlist.c | 17 +++++++++++++++++
 tools/perf/util/evlist.h          |  1 +
 tools/perf/util/parse-events.c    |  8 +++++++-
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c
index 0b0951030a2f..f924246eff78 100644
--- a/tools/perf/arch/x86/util/evlist.c
+++ b/tools/perf/arch/x86/util/evlist.c
@@ -17,3 +17,20 @@ int arch_evlist__add_default_attrs(struct evlist *evlist)
 	else
 		return parse_events(evlist, TOPDOWN_L1_EVENTS, NULL);
 }
+
+struct evsel *arch_evlist__leader(struct list_head *list)
+{
+	struct evsel *evsel, *first;
+
+	first = list_first_entry(list, struct evsel, core.node);
+
+	if (!pmu_have_event("cpu", "slots"))
+		return first;
+
+	__evlist__for_each_entry(list, evsel) {
+		if (evsel->pmu_name && !strcmp(evsel->pmu_name, "cpu") &&
+			evsel->name && strstr(evsel->name, "slots"))
+			return evsel;
+	}
+	return first;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 97bfb8d0be4f..993437ffe429 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -110,6 +110,7 @@ int __evlist__add_default_attrs(struct evlist *evlist,
 	__evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array))
 
 int arch_evlist__add_default_attrs(struct evlist *evlist);
+struct evsel *arch_evlist__leader(struct list_head *list);
 
 int evlist__add_dummy(struct evlist *evlist);
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 1d68167ab611..acf20ce98ce9 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1824,6 +1824,11 @@ out:
 	return ret;
 }
 
+__weak struct evsel *arch_evlist__leader(struct list_head *list)
+{
+	return list_first_entry(list, struct evsel, core.node);
+}
+
 void parse_events__set_leader(char *name, struct list_head *list,
 			      struct parse_events_state *parse_state)
 {
@@ -1837,9 +1842,10 @@ void parse_events__set_leader(char *name, struct list_head *list,
 	if (parse_events__set_leader_for_uncore_aliase(name, list, parse_state))
 		return;
 
-	leader = list_first_entry(list, struct evsel, core.node);
+	leader = arch_evlist__leader(list);
 	__perf_evlist__set_leader(list, &leader->core);
 	leader->group_name = name ? strdup(name) : NULL;
+	list_move(&leader->core.node, list);
 }
 
 /* list_event is assumed to point to malloc'ed memory */
-- 
cgit v1.2.3


From ed17b1914978eddb2b01f2d34577f1c82518c650 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Fri, 3 Dec 2021 22:05:44 +0100
Subject: perf tools: Drop requirement for libstdc++.so for libopencsd check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's possible to link against libopencsd_c_api without having
libstdc++.so available, only libstdc++.so.6.0.28 (or whatever version is
in use) needs to be available. The same holds true for libopencsd.so.
When -lstdc++ (or -lopencsd) is explicitly passed to the linker however
the .so file must be available.

So wrap adding the dependencies into a check for static linking that
actually requires adding them all. The same construct is already used
for some other tests in the same file to reduce dependencies in the
dynamic linking case.

Fixes: 573cf5c9a152 ("perf build: Add missing -lstdc++ when linking with libopencsd")
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Cc: Adrian Bunk <bunk@debian.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Branislav Rankov <branislav.rankov@arm.com>
Cc: Diederik de Haas <didi.debian@cknow.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/all/20211203210544.1137935-1-uwe@kleine-koenig.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 94bb53b0cebd..96ad944ca6a8 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -144,7 +144,10 @@ FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto
 ifdef CSINCLUDES
   LIBOPENCSD_CFLAGS := -I$(CSINCLUDES)
 endif
-OPENCSDLIBS := -lopencsd_c_api -lopencsd -lstdc++
+OPENCSDLIBS := -lopencsd_c_api
+ifeq ($(findstring -static,${LDFLAGS}),-static)
+  OPENCSDLIBS += -lopencsd -lstdc++
+endif
 ifdef CSLIBS
   LIBOPENCSD_LDFLAGS := -L$(CSLIBS)
 endif
-- 
cgit v1.2.3


From e69dc84282fb474cb87097c6c945d8f90e05a4d9 Mon Sep 17 00:00:00 2001
From: Jin Yao <yao.jin@linux.intel.com>
Date: Thu, 9 Sep 2021 14:22:15 +0800
Subject: perf stat: Support --cputype option for hybrid events

In previous patch, we have supported the syntax which enables
the event on a specified pmu, such as:

cpu_core/<event>/
cpu_atom/<event>/

While this syntax is not very easy for applying on a set of
events or applying on a group. In following example, we have to
explicitly assign the pmu prefix.

  # ./perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}' -- sleep 1

   Performance counter stats for 'sleep 1':

           1,158,545      cpu_core/cycles/
           1,003,113      cpu_core/instructions/

         1.002428712 seconds time elapsed

A much easier way is:

  # ./perf stat --cputype core -e '{cycles,instructions}' -- sleep 1

   Performance counter stats for 'sleep 1':

           1,101,071      cpu_core/cycles/
             939,892      cpu_core/instructions/

         1.002363142 seconds time elapsed

For this example, the '--cputype' enables the events from specified
pmu (cpu_core).

If '--cputype' conflicts with pmu prefix, '--cputype' is ignored.

  # ./perf stat --cputype core -e cycles,cpu_atom/instructions/ -a -- sleep 1

   Performance counter stats for 'system wide':

          21,003,407      cpu_core/cycles/
             367,886      cpu_atom/instructions/

         1.002203520 seconds time elapsed

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20210909062215.10278-1-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-stat.txt |  4 ++++
 tools/perf/builtin-stat.c              | 24 ++++++++++++++++++++++++
 tools/perf/util/evlist.h               |  1 +
 tools/perf/util/parse-events-hybrid.c  |  9 ++++++---
 4 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 604e6f2301ea..c06c341e72b9 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -495,6 +495,10 @@ This option can be enabled in perf config by setting the variable
 
 $ perf config stat.no-csv-summary=true
 
+--cputype::
+Only enable events on applying cpu with this type for hybrid platform
+(e.g. core or atom)"
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7974933dbc77..f6ca2b054c5b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1168,6 +1168,26 @@ static int parse_stat_cgroups(const struct option *opt,
 	return parse_cgroups(opt, str, unset);
 }
 
+static int parse_hybrid_type(const struct option *opt,
+			     const char *str,
+			     int unset __maybe_unused)
+{
+	struct evlist *evlist = *(struct evlist **)opt->value;
+
+	if (!list_empty(&evlist->core.entries)) {
+		fprintf(stderr, "Must define cputype before events/metrics\n");
+		return -1;
+	}
+
+	evlist->hybrid_pmu_name = perf_pmu__hybrid_type_to_pmu(str);
+	if (!evlist->hybrid_pmu_name) {
+		fprintf(stderr, "--cputype %s is not supported!\n", str);
+		return -1;
+	}
+
+	return 0;
+}
+
 static struct option stat_options[] = {
 	OPT_BOOLEAN('T', "transaction", &transaction_run,
 		    "hardware transaction statistics"),
@@ -1282,6 +1302,10 @@ static struct option stat_options[] = {
 		       "don't print 'summary' for CSV summary output"),
 	OPT_BOOLEAN(0, "quiet", &stat_config.quiet,
 			"don't print output (useful with record)"),
+	OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type",
+		     "Only enable events on applying cpu with this type "
+		     "for hybrid platform (e.g. core or atom)",
+		     parse_hybrid_type),
 #ifdef HAVE_LIBPFM
 	OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
 		"libpfm4 event selector. use 'perf list' to list available events",
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 993437ffe429..27594900a052 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -64,6 +64,7 @@ struct evlist {
 	struct evsel *selected;
 	struct events_stats stats;
 	struct perf_env	*env;
+	const char *hybrid_pmu_name;
 	void (*trace_event_sample_raw)(struct evlist *evlist,
 				       union perf_event *event,
 				       struct perf_sample *sample);
diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c
index 9fc86971027b..284f8eabd3b9 100644
--- a/tools/perf/util/parse-events-hybrid.c
+++ b/tools/perf/util/parse-events-hybrid.c
@@ -63,10 +63,13 @@ static int create_event_hybrid(__u32 config_type, int *idx,
 static int pmu_cmp(struct parse_events_state *parse_state,
 		   struct perf_pmu *pmu)
 {
-	if (!parse_state->hybrid_pmu_name)
-		return 0;
+	if (parse_state->evlist && parse_state->evlist->hybrid_pmu_name)
+		return strcmp(parse_state->evlist->hybrid_pmu_name, pmu->name);
+
+	if (parse_state->hybrid_pmu_name)
+		return strcmp(parse_state->hybrid_pmu_name, pmu->name);
 
-	return strcmp(parse_state->hybrid_pmu_name, pmu->name);
+	return 0;
 }
 
 static int add_hw_hybrid(struct parse_events_state *parse_state,
-- 
cgit v1.2.3


From 8ff4f20f3eb55dea0dbbe5e32043ab6b7427882f Mon Sep 17 00:00:00 2001
From: Andrew Kilroy <andrew.kilroy@arm.com>
Date: Fri, 3 Dec 2021 12:35:22 +0000
Subject: perf vendor events arm64: Fix JSON indentation to 4 spaces standard

Correct indentation to 4 spaces, same as the other JSON files.

Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Andrew Kilroy <andrew.kilroy@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lore.kernel.org/lkml/20211203123525.31127-2-andrew.kilroy@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../pmu-events/arch/arm64/armv8-recommended.json   | 202 ++++++++++-----------
 1 file changed, 101 insertions(+), 101 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json b/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
index d0a19866563d..210afa856091 100644
--- a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
+++ b/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
@@ -148,305 +148,305 @@
         "EventCode": "0x60",
         "EventName": "BUS_ACCESS_RD",
         "BriefDescription": "Bus access read"
-   },
-   {
+    },
+    {
         "PublicDescription": "Bus access write",
         "EventCode": "0x61",
         "EventName": "BUS_ACCESS_WR",
         "BriefDescription": "Bus access write"
-   },
-   {
+    },
+    {
         "PublicDescription": "Bus access, Normal, Cacheable, Shareable",
         "EventCode": "0x62",
         "EventName": "BUS_ACCESS_SHARED",
         "BriefDescription": "Bus access, Normal, Cacheable, Shareable"
-   },
-   {
+    },
+    {
         "PublicDescription": "Bus access, not Normal, Cacheable, Shareable",
         "EventCode": "0x63",
         "EventName": "BUS_ACCESS_NOT_SHARED",
         "BriefDescription": "Bus access, not Normal, Cacheable, Shareable"
-   },
-   {
+    },
+    {
         "PublicDescription": "Bus access, Normal",
         "EventCode": "0x64",
         "EventName": "BUS_ACCESS_NORMAL",
         "BriefDescription": "Bus access, Normal"
-   },
-   {
+    },
+    {
         "PublicDescription": "Bus access, peripheral",
         "EventCode": "0x65",
         "EventName": "BUS_ACCESS_PERIPH",
         "BriefDescription": "Bus access, peripheral"
-   },
-   {
+    },
+    {
         "PublicDescription": "Data memory access, read",
         "EventCode": "0x66",
         "EventName": "MEM_ACCESS_RD",
         "BriefDescription": "Data memory access, read"
-   },
-   {
+    },
+    {
         "PublicDescription": "Data memory access, write",
         "EventCode": "0x67",
         "EventName": "MEM_ACCESS_WR",
         "BriefDescription": "Data memory access, write"
-   },
-   {
+    },
+    {
         "PublicDescription": "Unaligned access, read",
         "EventCode": "0x68",
         "EventName": "UNALIGNED_LD_SPEC",
         "BriefDescription": "Unaligned access, read"
-   },
-   {
+    },
+    {
         "PublicDescription": "Unaligned access, write",
         "EventCode": "0x69",
         "EventName": "UNALIGNED_ST_SPEC",
         "BriefDescription": "Unaligned access, write"
-   },
-   {
+    },
+    {
         "PublicDescription": "Unaligned access",
         "EventCode": "0x6a",
         "EventName": "UNALIGNED_LDST_SPEC",
         "BriefDescription": "Unaligned access"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exclusive operation speculatively executed, LDREX or LDX",
         "EventCode": "0x6c",
         "EventName": "LDREX_SPEC",
         "BriefDescription": "Exclusive operation speculatively executed, LDREX or LDX"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exclusive operation speculatively executed, STREX or STX pass",
         "EventCode": "0x6d",
         "EventName": "STREX_PASS_SPEC",
         "BriefDescription": "Exclusive operation speculatively executed, STREX or STX pass"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exclusive operation speculatively executed, STREX or STX fail",
         "EventCode": "0x6e",
         "EventName": "STREX_FAIL_SPEC",
         "BriefDescription": "Exclusive operation speculatively executed, STREX or STX fail"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exclusive operation speculatively executed, STREX or STX",
         "EventCode": "0x6f",
         "EventName": "STREX_SPEC",
         "BriefDescription": "Exclusive operation speculatively executed, STREX or STX"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, load",
         "EventCode": "0x70",
         "EventName": "LD_SPEC",
         "BriefDescription": "Operation speculatively executed, load"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, store",
         "EventCode": "0x71",
         "EventName": "ST_SPEC",
         "BriefDescription": "Operation speculatively executed, store"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, load or store",
         "EventCode": "0x72",
         "EventName": "LDST_SPEC",
         "BriefDescription": "Operation speculatively executed, load or store"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, integer data processing",
         "EventCode": "0x73",
         "EventName": "DP_SPEC",
         "BriefDescription": "Operation speculatively executed, integer data processing"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, Advanced SIMD instruction",
         "EventCode": "0x74",
         "EventName": "ASE_SPEC",
         "BriefDescription": "Operation speculatively executed, Advanced SIMD instruction"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, floating-point instruction",
         "EventCode": "0x75",
         "EventName": "VFP_SPEC",
         "BriefDescription": "Operation speculatively executed, floating-point instruction"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, software change of the PC",
         "EventCode": "0x76",
         "EventName": "PC_WRITE_SPEC",
         "BriefDescription": "Operation speculatively executed, software change of the PC"
-   },
-   {
+    },
+    {
         "PublicDescription": "Operation speculatively executed, Cryptographic instruction",
         "EventCode": "0x77",
         "EventName": "CRYPTO_SPEC",
         "BriefDescription": "Operation speculatively executed, Cryptographic instruction"
-   },
-   {
+    },
+    {
         "PublicDescription": "Branch speculatively executed, immediate branch",
         "EventCode": "0x78",
         "EventName": "BR_IMMED_SPEC",
         "BriefDescription": "Branch speculatively executed, immediate branch"
-   },
-   {
+    },
+    {
         "PublicDescription": "Branch speculatively executed, procedure return",
         "EventCode": "0x79",
         "EventName": "BR_RETURN_SPEC",
         "BriefDescription": "Branch speculatively executed, procedure return"
-   },
-   {
+    },
+    {
         "PublicDescription": "Branch speculatively executed, indirect branch",
         "EventCode": "0x7a",
         "EventName": "BR_INDIRECT_SPEC",
         "BriefDescription": "Branch speculatively executed, indirect branch"
-   },
-   {
+    },
+    {
         "PublicDescription": "Barrier speculatively executed, ISB",
         "EventCode": "0x7c",
         "EventName": "ISB_SPEC",
         "BriefDescription": "Barrier speculatively executed, ISB"
-   },
-   {
+    },
+    {
         "PublicDescription": "Barrier speculatively executed, DSB",
         "EventCode": "0x7d",
         "EventName": "DSB_SPEC",
         "BriefDescription": "Barrier speculatively executed, DSB"
-   },
-   {
+    },
+    {
         "PublicDescription": "Barrier speculatively executed, DMB",
         "EventCode": "0x7e",
         "EventName": "DMB_SPEC",
         "BriefDescription": "Barrier speculatively executed, DMB"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Other synchronous",
         "EventCode": "0x81",
         "EventName": "EXC_UNDEF",
         "BriefDescription": "Exception taken, Other synchronous"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Supervisor Call",
         "EventCode": "0x82",
         "EventName": "EXC_SVC",
         "BriefDescription": "Exception taken, Supervisor Call"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Instruction Abort",
         "EventCode": "0x83",
         "EventName": "EXC_PABORT",
         "BriefDescription": "Exception taken, Instruction Abort"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Data Abort and SError",
         "EventCode": "0x84",
         "EventName": "EXC_DABORT",
         "BriefDescription": "Exception taken, Data Abort and SError"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, IRQ",
         "EventCode": "0x86",
         "EventName": "EXC_IRQ",
         "BriefDescription": "Exception taken, IRQ"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, FIQ",
         "EventCode": "0x87",
         "EventName": "EXC_FIQ",
         "BriefDescription": "Exception taken, FIQ"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Secure Monitor Call",
         "EventCode": "0x88",
         "EventName": "EXC_SMC",
         "BriefDescription": "Exception taken, Secure Monitor Call"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Hypervisor Call",
         "EventCode": "0x8a",
         "EventName": "EXC_HVC",
         "BriefDescription": "Exception taken, Hypervisor Call"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Instruction Abort not taken locally",
         "EventCode": "0x8b",
         "EventName": "EXC_TRAP_PABORT",
         "BriefDescription": "Exception taken, Instruction Abort not taken locally"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Data Abort or SError not taken locally",
         "EventCode": "0x8c",
         "EventName": "EXC_TRAP_DABORT",
         "BriefDescription": "Exception taken, Data Abort or SError not taken locally"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, Other traps not taken locally",
         "EventCode": "0x8d",
         "EventName": "EXC_TRAP_OTHER",
         "BriefDescription": "Exception taken, Other traps not taken locally"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, IRQ not taken locally",
         "EventCode": "0x8e",
         "EventName": "EXC_TRAP_IRQ",
         "BriefDescription": "Exception taken, IRQ not taken locally"
-   },
-   {
+    },
+    {
         "PublicDescription": "Exception taken, FIQ not taken locally",
         "EventCode": "0x8f",
         "EventName": "EXC_TRAP_FIQ",
         "BriefDescription": "Exception taken, FIQ not taken locally"
-   },
-   {
+    },
+    {
         "PublicDescription": "Release consistency operation speculatively executed, Load-Acquire",
         "EventCode": "0x90",
         "EventName": "RC_LD_SPEC",
         "BriefDescription": "Release consistency operation speculatively executed, Load-Acquire"
-   },
-   {
+    },
+    {
         "PublicDescription": "Release consistency operation speculatively executed, Store-Release",
         "EventCode": "0x91",
         "EventName": "RC_ST_SPEC",
         "BriefDescription": "Release consistency operation speculatively executed, Store-Release"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache access, read",
         "EventCode": "0xa0",
         "EventName": "L3D_CACHE_RD",
         "BriefDescription": "Attributable Level 3 data or unified cache access, read"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache access, write",
         "EventCode": "0xa1",
         "EventName": "L3D_CACHE_WR",
         "BriefDescription": "Attributable Level 3 data or unified cache access, write"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache refill, read",
         "EventCode": "0xa2",
         "EventName": "L3D_CACHE_REFILL_RD",
         "BriefDescription": "Attributable Level 3 data or unified cache refill, read"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache refill, write",
         "EventCode": "0xa3",
         "EventName": "L3D_CACHE_REFILL_WR",
         "BriefDescription": "Attributable Level 3 data or unified cache refill, write"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, victim",
         "EventCode": "0xa6",
         "EventName": "L3D_CACHE_WB_VICTIM",
         "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, victim"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean",
         "EventCode": "0xa7",
         "EventName": "L3D_CACHE_WB_CLEAN",
         "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean"
-   },
-   {
+    },
+    {
         "PublicDescription": "Attributable Level 3 data or unified cache access, invalidate",
         "EventCode": "0xa8",
         "EventName": "L3D_CACHE_INVAL",
         "BriefDescription": "Attributable Level 3 data or unified cache access, invalidate"
-   }
+    }
 ]
-- 
cgit v1.2.3


From d76c51f976ed1095dcd8c5c85ec9d8fed77a3e05 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Tue, 7 Dec 2021 00:39:31 +0300
Subject: selftests: tls: add missing AES-CCM cipher tests

Add tests for TLSv1.2 and TLSv1.3 with AES-CCM cipher.

Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 8a22db0cca49..fb1bb402ee10 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -31,6 +31,7 @@ struct tls_crypto_info_keys {
 		struct tls12_crypto_info_chacha20_poly1305 chacha20;
 		struct tls12_crypto_info_sm4_gcm sm4gcm;
 		struct tls12_crypto_info_sm4_ccm sm4ccm;
+		struct tls12_crypto_info_aes_ccm_128 aesccm128;
 	};
 	size_t len;
 };
@@ -61,6 +62,11 @@ static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type,
 		tls12->sm4ccm.info.version = tls_version;
 		tls12->sm4ccm.info.cipher_type = cipher_type;
 		break;
+	case TLS_CIPHER_AES_CCM_128:
+		tls12->len = sizeof(struct tls12_crypto_info_aes_ccm_128);
+		tls12->aesccm128.info.version = tls_version;
+		tls12->aesccm128.info.cipher_type = cipher_type;
+		break;
 	default:
 		break;
 	}
@@ -261,6 +267,18 @@ FIXTURE_VARIANT_ADD(tls, 13_sm4_ccm)
 	.cipher_type = TLS_CIPHER_SM4_CCM,
 };
 
+FIXTURE_VARIANT_ADD(tls, 12_aes_ccm)
+{
+	.tls_version = TLS_1_2_VERSION,
+	.cipher_type = TLS_CIPHER_AES_CCM_128,
+};
+
+FIXTURE_VARIANT_ADD(tls, 13_aes_ccm)
+{
+	.tls_version = TLS_1_3_VERSION,
+	.cipher_type = TLS_CIPHER_AES_CCM_128,
+};
+
 FIXTURE_SETUP(tls)
 {
 	struct tls_crypto_info_keys tls12;
-- 
cgit v1.2.3


From 13bf99ab2130783e2b1988ef415585e3af7df97b Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Tue, 7 Dec 2021 00:39:32 +0300
Subject: selftests: tls: add missing AES256-GCM cipher

Add tests for TLSv1.2 and TLSv1.3 with AES256-GCM cipher

Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index fb1bb402ee10..6e468e0f42f7 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -32,6 +32,7 @@ struct tls_crypto_info_keys {
 		struct tls12_crypto_info_sm4_gcm sm4gcm;
 		struct tls12_crypto_info_sm4_ccm sm4ccm;
 		struct tls12_crypto_info_aes_ccm_128 aesccm128;
+		struct tls12_crypto_info_aes_gcm_256 aesgcm256;
 	};
 	size_t len;
 };
@@ -67,6 +68,11 @@ static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type,
 		tls12->aesccm128.info.version = tls_version;
 		tls12->aesccm128.info.cipher_type = cipher_type;
 		break;
+	case TLS_CIPHER_AES_GCM_256:
+		tls12->len = sizeof(struct tls12_crypto_info_aes_gcm_256);
+		tls12->aesgcm256.info.version = tls_version;
+		tls12->aesgcm256.info.cipher_type = cipher_type;
+		break;
 	default:
 		break;
 	}
@@ -279,6 +285,18 @@ FIXTURE_VARIANT_ADD(tls, 13_aes_ccm)
 	.cipher_type = TLS_CIPHER_AES_CCM_128,
 };
 
+FIXTURE_VARIANT_ADD(tls, 12_aes_gcm_256)
+{
+	.tls_version = TLS_1_2_VERSION,
+	.cipher_type = TLS_CIPHER_AES_GCM_256,
+};
+
+FIXTURE_VARIANT_ADD(tls, 13_aes_gcm_256)
+{
+	.tls_version = TLS_1_3_VERSION,
+	.cipher_type = TLS_CIPHER_AES_GCM_256,
+};
+
 FIXTURE_SETUP(tls)
 {
 	struct tls_crypto_info_keys tls12;
-- 
cgit v1.2.3


From 11f8cb8903ba4e8ba900fa4e4ab29d0fb4c9ef5d Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Tue, 23 Nov 2021 21:23:30 +0800
Subject: ACPI: tools: Fix compilation when output directory is not present

Compiling the ACPI tools when output directory parameter is specified,
but the output directory is not present, triggers the following error:

make O=/data/test/tmp/ -C tools/power/acpi/

make: Entering directory '/data/src/kernel/linux/tools/power/acpi'
  DESCEND tools/acpidbg
make[1]: Entering directory '/data/src/kernel/linux/tools/power/acpi/tools/acpidbg'
  MKDIR    include
  CP       include
  CC       tools/acpidbg/acpidbg.o
Assembler messages:
Fatal error: can't create /data/test/tmp/tools/power/acpi/tools/acpidbg/acpidbg.o: No such file or directory
make[1]: *** [../../Makefile.rules:24: /data/test/tmp/tools/power/acpi/tools/acpidbg/acpidbg.o] Error 1
make[1]: Leaving directory '/data/src/kernel/linux/tools/power/acpi/tools/acpidbg'
make: *** [Makefile:18: acpidbg] Error 2
make: Leaving directory '/data/src/kernel/linux/tools/power/acpi'

which occurs because the output directory has not been created yet.

Fix this issue by creating the output directory before compiling.

Reported-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
[ rjw: New subject, changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/power/acpi/Makefile.config | 1 +
 tools/power/acpi/Makefile.rules  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config
index 331f6d30f472..cd7106876a5f 100644
--- a/tools/power/acpi/Makefile.config
+++ b/tools/power/acpi/Makefile.config
@@ -69,6 +69,7 @@ KERNEL_INCLUDE := $(OUTPUT)include
 ACPICA_INCLUDE := $(srctree)/../../../drivers/acpi/acpica
 CFLAGS += -D_LINUX -I$(KERNEL_INCLUDE) -I$(ACPICA_INCLUDE)
 CFLAGS += $(WARNINGS)
+MKDIR = mkdir
 
 ifeq ($(strip $(V)),false)
 	QUIET=@
diff --git a/tools/power/acpi/Makefile.rules b/tools/power/acpi/Makefile.rules
index 2a6c170b57cd..1d7616f5d0ae 100644
--- a/tools/power/acpi/Makefile.rules
+++ b/tools/power/acpi/Makefile.rules
@@ -21,6 +21,7 @@ $(KERNEL_INCLUDE):
 
 $(objdir)%.o: %.c $(KERNEL_INCLUDE)
 	$(ECHO) "  CC      " $(subst $(OUTPUT),,$@)
+	$(QUIET) $(MKDIR) -p $(objdir) 2>/dev/null
 	$(QUIET) $(CC) -c $(CFLAGS) -o $@ $<
 
 all: $(OUTPUT)$(TOOL)
-- 
cgit v1.2.3


From b560b21f71eb4ef9dfc7c8ec1d0e4d7f9aa54b51 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Tue, 7 Dec 2021 10:15:21 +0200
Subject: bpf: Add selftests to cover packet access corner cases

This commit adds BPF verifier selftests that cover all corner cases by
packet boundary checks. Specifically, 8-byte packet reads are tested at
the beginning of data and at the beginning of data_meta, using all kinds
of boundary checks (all comparison operators: <, >, <=, >=; both
permutations of operands: data + length compared to end, end compared to
data + length). For each case there are three tests:

1. Length is just enough for an 8-byte read. Length is either 7 or 8,
   depending on the comparison.

2. Length is increased by 1 - should still pass the verifier. These
   cases are useful, because they failed before commit 2fa7d94afc1a
   ("bpf: Fix the off-by-two error in range markings").

3. Length is decreased by 1 - should be rejected by the verifier.

Some existing tests are just renamed to avoid duplication.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211207081521.41923-1-maximmi@nvidia.com
---
 .../bpf/verifier/xdp_direct_packet_access.c        | 600 ++++++++++++++++++++-
 1 file changed, 584 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
index de172a5b8754..b4ec228eb95d 100644
--- a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
+++ b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
@@ -35,7 +35,7 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 },
 {
-	"XDP pkt read, pkt_data' > pkt_end, good access",
+	"XDP pkt read, pkt_data' > pkt_end, corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -87,6 +87,41 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_data' > pkt_end, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data' > pkt_end, corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_end > pkt_data', good access",
 	.insns = {
@@ -106,7 +141,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_end > pkt_data', bad access 1",
+	"XDP pkt read, pkt_end > pkt_data', corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -142,6 +177,42 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_end > pkt_data', corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end > pkt_data', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_data' < pkt_end, good access",
 	.insns = {
@@ -161,7 +232,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data' < pkt_end, bad access 1",
+	"XDP pkt read, pkt_data' < pkt_end, corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -198,7 +269,43 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_end < pkt_data', good access",
+	"XDP pkt read, pkt_data' < pkt_end, corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data' < pkt_end, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end < pkt_data', corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -250,6 +357,41 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_end < pkt_data', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end < pkt_data', corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_data' >= pkt_end, good access",
 	.insns = {
@@ -268,7 +410,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data' >= pkt_end, bad access 1",
+	"XDP pkt read, pkt_data' >= pkt_end, corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -304,7 +446,41 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_end >= pkt_data', good access",
+	"XDP pkt read, pkt_data' >= pkt_end, corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data' >= pkt_end, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end >= pkt_data', corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -359,7 +535,44 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data' <= pkt_end, good access",
+	"XDP pkt read, pkt_end >= pkt_data', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end >= pkt_data', corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data' <= pkt_end, corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -413,6 +626,43 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_data' <= pkt_end, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data' <= pkt_end, corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_end <= pkt_data', good access",
 	.insns = {
@@ -431,7 +681,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_end <= pkt_data', bad access 1",
+	"XDP pkt read, pkt_end <= pkt_data', corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
 	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
@@ -467,7 +717,41 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_meta' > pkt_data, good access",
+	"XDP pkt read, pkt_end <= pkt_data', corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_end <= pkt_data', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct xdp_md, data_end)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' > pkt_data, corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -519,6 +803,41 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_meta' > pkt_data, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' > pkt_data, corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_data > pkt_meta', good access",
 	.insns = {
@@ -538,7 +857,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data > pkt_meta', bad access 1",
+	"XDP pkt read, pkt_data > pkt_meta', corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -574,6 +893,42 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_data > pkt_meta', corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data > pkt_meta', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_meta' < pkt_data, good access",
 	.insns = {
@@ -593,7 +948,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_meta' < pkt_data, bad access 1",
+	"XDP pkt read, pkt_meta' < pkt_data, corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -630,7 +985,43 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data < pkt_meta', good access",
+	"XDP pkt read, pkt_meta' < pkt_data, corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' < pkt_data, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data < pkt_meta', corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -682,6 +1073,41 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_data < pkt_meta', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data < pkt_meta', corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_meta' >= pkt_data, good access",
 	.insns = {
@@ -700,7 +1126,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_meta' >= pkt_data, bad access 1",
+	"XDP pkt read, pkt_meta' >= pkt_data, corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -736,7 +1162,41 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data >= pkt_meta', good access",
+	"XDP pkt read, pkt_meta' >= pkt_data, corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' >= pkt_data, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data >= pkt_meta', corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -791,7 +1251,44 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_meta' <= pkt_data, good access",
+	"XDP pkt read, pkt_data >= pkt_meta', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data >= pkt_meta', corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' <= pkt_data, corner case, good access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -845,6 +1342,43 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_meta' <= pkt_data, corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_meta' <= pkt_data, corner case -1, bad access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "R1 offset is outside of the packet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
 {
 	"XDP pkt read, pkt_data <= pkt_meta', good access",
 	.insns = {
@@ -863,7 +1397,7 @@
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
-	"XDP pkt read, pkt_data <= pkt_meta', bad access 1",
+	"XDP pkt read, pkt_data <= pkt_meta', corner case -1, bad access",
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 		    offsetof(struct xdp_md, data_meta)),
@@ -898,3 +1432,37 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"XDP pkt read, pkt_data <= pkt_meta', corner case, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+	"XDP pkt read, pkt_data <= pkt_meta', corner case +1, good access",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct xdp_md, data_meta)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+	BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
-- 
cgit v1.2.3


From 1cc1e4c8aab4213bd4e6353dec2620476a233d6d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Dec 2021 14:43:42 +0100
Subject: objtool: Add straight-line-speculation validation

Teach objtool to validate the straight-line-speculation constraints:

 - speculation trap after indirect calls
 - speculation trap after RET

Notable: when an instruction is annotated RETPOLINE_SAFE, indicating
  speculation isn't a problem, also don't care about sls for that
  instruction.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20211204134908.023037659@infradead.org
---
 tools/objtool/arch/x86/decode.c         | 13 +++++++++----
 tools/objtool/builtin-check.c           |  3 ++-
 tools/objtool/check.c                   | 14 ++++++++++++++
 tools/objtool/include/objtool/arch.h    |  1 +
 tools/objtool/include/objtool/builtin.h |  2 +-
 5 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 4d6d7fc13255..c10ef78df050 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -531,6 +531,11 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		}
 		break;
 
+	case 0xcc:
+		/* int3 */
+		*type = INSN_TRAP;
+		break;
+
 	case 0xe3:
 		/* jecxz/jrcxz */
 		*type = INSN_JUMP_CONDITIONAL;
@@ -697,10 +702,10 @@ const char *arch_ret_insn(int len)
 {
 	static const char ret[5][5] = {
 		{ BYTE_RET },
-		{ BYTE_RET, BYTES_NOP1 },
-		{ BYTE_RET, BYTES_NOP2 },
-		{ BYTE_RET, BYTES_NOP3 },
-		{ BYTE_RET, BYTES_NOP4 },
+		{ BYTE_RET, 0xcc },
+		{ BYTE_RET, 0xcc, BYTES_NOP1 },
+		{ BYTE_RET, 0xcc, BYTES_NOP2 },
+		{ BYTE_RET, 0xcc, BYTES_NOP3 },
 	};
 
 	if (len < 1 || len > 5) {
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 8b38b5d6fec7..38070f26105b 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -20,7 +20,7 @@
 #include <objtool/objtool.h>
 
 bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats,
-     validate_dup, vmlinux, mcount, noinstr, backup;
+     validate_dup, vmlinux, mcount, noinstr, backup, sls;
 
 static const char * const check_usage[] = {
 	"objtool check [<options>] file.o",
@@ -45,6 +45,7 @@ const struct option check_options[] = {
 	OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"),
 	OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"),
 	OPT_BOOLEAN('B', "backup", &backup, "create .orig files before modification"),
+	OPT_BOOLEAN('S', "sls", &sls, "validate straight-line-speculation"),
 	OPT_END(),
 };
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 21735829b860..e28172f6e792 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3084,6 +3084,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 		switch (insn->type) {
 
 		case INSN_RETURN:
+			if (next_insn && next_insn->type == INSN_TRAP) {
+				next_insn->ignore = true;
+			} else if (sls && !insn->retpoline_safe) {
+				WARN_FUNC("missing int3 after ret",
+					  insn->sec, insn->offset);
+			}
 			return validate_return(func, insn, &state);
 
 		case INSN_CALL:
@@ -3127,6 +3133,14 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 			break;
 
 		case INSN_JUMP_DYNAMIC:
+			if (next_insn && next_insn->type == INSN_TRAP) {
+				next_insn->ignore = true;
+			} else if (sls && !insn->retpoline_safe) {
+				WARN_FUNC("missing int3 after indirect jump",
+					  insn->sec, insn->offset);
+			}
+
+			/* fallthrough */
 		case INSN_JUMP_DYNAMIC_CONDITIONAL:
 			if (is_sibling_call(insn)) {
 				ret = validate_sibling_call(file, insn, &state);
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 589ff58426ab..76bae3078286 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -26,6 +26,7 @@ enum insn_type {
 	INSN_CLAC,
 	INSN_STD,
 	INSN_CLD,
+	INSN_TRAP,
 	INSN_OTHER,
 };
 
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index 15ac0b7d3d6a..89ba869ed08f 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -9,7 +9,7 @@
 
 extern const struct option check_options[];
 extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats,
-            validate_dup, vmlinux, mcount, noinstr, backup;
+            validate_dup, vmlinux, mcount, noinstr, backup, sls;
 
 extern int cmd_parse_options(int argc, const char **argv, const char * const usage[]);
 
-- 
cgit v1.2.3


From 8d0f9e73efe7f0ff4b1d0d013044e8ef91782689 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 7 Dec 2021 15:23:40 -0800
Subject: perf/bpf_counter: Use bpf_map_create instead of bpf_create_map

bpf_create_map is deprecated. Replace it with bpf_map_create. Also add a
__weak bpf_map_create() so that when older version of libbpf is linked as
a shared library, it falls back to bpf_create_map().

Fixes: 992c4225419a ("libbpf: Unify low-level map creation APIs w/ new bpf_map_create()")
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211207232340.2561471-1-song@kernel.org
---
 tools/perf/util/bpf_counter.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index c17d4a43ce06..5a97fd7d0a71 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -307,6 +307,20 @@ static bool bperf_attr_map_compatible(int attr_map_fd)
 		(map_info.value_size == sizeof(struct perf_event_attr_map_entry));
 }
 
+int __weak
+bpf_map_create(enum bpf_map_type map_type,
+	       const char *map_name __maybe_unused,
+	       __u32 key_size,
+	       __u32 value_size,
+	       __u32 max_entries,
+	       const struct bpf_map_create_opts *opts __maybe_unused)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+	return bpf_create_map(map_type, key_size, value_size, max_entries, 0);
+#pragma GCC diagnostic pop
+}
+
 static int bperf_lock_attr_map(struct target *target)
 {
 	char path[PATH_MAX];
@@ -320,10 +334,10 @@ static int bperf_lock_attr_map(struct target *target)
 	}
 
 	if (access(path, F_OK)) {
-		map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+		map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
 					sizeof(struct perf_event_attr),
 					sizeof(struct perf_event_attr_map_entry),
-					ATTR_MAP_SIZE, 0);
+					ATTR_MAP_SIZE, NULL);
 		if (map_fd < 0)
 			return -1;
 
-- 
cgit v1.2.3


From b540358e6c4d86eb450f3539aea198653e656641 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 8 Dec 2021 21:04:03 -0800
Subject: selftests/bpf: Fix a compilation warning

The following warning is triggered when I used clang compiler
to build the selftest.

  /.../prog_tests/btf_dedup_split.c:368:6: warning: variable 'btf2' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
        if (!ASSERT_OK(err, "btf_dedup"))
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /.../prog_tests/btf_dedup_split.c:424:12: note: uninitialized use occurs here
        btf__free(btf2);
                  ^~~~
  /.../prog_tests/btf_dedup_split.c:368:2: note: remove the 'if' if its condition is always false
        if (!ASSERT_OK(err, "btf_dedup"))
        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  /.../prog_tests/btf_dedup_split.c:343:25: note: initialize the variable 'btf2' to silence this warning
        struct btf *btf1, *btf2;
                               ^
                                = NULL

Initialize local variable btf2 = NULL and the warning is gone.

Fixes: 9a49afe6f5a5 ("selftests/bpf: Add btf_dedup case with duplicated structs within CU")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211209050403.1770836-1-yhs@fb.com
---
 tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
index 878a864dae3b..90aac437576d 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
@@ -340,7 +340,7 @@ static void btf_add_dup_struct_in_cu(struct btf *btf, int start_id)
 
 static void test_split_dup_struct_in_cu()
 {
-	struct btf *btf1, *btf2;
+	struct btf *btf1, *btf2 = NULL;
 	int err;
 
 	/* generate the base data.. */
-- 
cgit v1.2.3


From 3c42e9542050d49610077e083c7c3f5fd5e26820 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Date: Tue, 7 Dec 2021 10:05:57 -0300
Subject: selftests/powerpc/spectre_v2: Return skip code when miss_percent is
 high

A mis-match between reported and actual mitigation is not restricted to the
Vulnerable case. The guest might also report the mitigation as "Software
count cache flush" and the host will still mitigate with branch cache
disabled.

So, instead of skipping depending on the detected mitigation, simply skip
whenever the detected miss_percent is the expected one for a fully
mitigated system, that is, above 95%.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211207130557.40566-1-cascardo@canonical.com
---
 tools/testing/selftests/powerpc/security/spectre_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/security/spectre_v2.c b/tools/testing/selftests/powerpc/security/spectre_v2.c
index adc2b7294e5f..83647b8277e7 100644
--- a/tools/testing/selftests/powerpc/security/spectre_v2.c
+++ b/tools/testing/selftests/powerpc/security/spectre_v2.c
@@ -193,7 +193,7 @@ int spectre_v2_test(void)
 			 * We are not vulnerable and reporting otherwise, so
 			 * missing such a mismatch is safe.
 			 */
-			if (state == VULNERABLE)
+			if (miss_percent > 95)
 				return 4;
 
 			return 1;
-- 
cgit v1.2.3


From 3a49cc22d31eccceb856f468be0646faa2d4643f Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Thu, 9 Dec 2021 11:51:13 -0500
Subject: tools/lib/lockdep: drop leftover liblockdep headers

Clean up remaining headers that are specific to liblockdep but lived in
the shared header directory.  These are all unused after the liblockdep
code was removed in commit 7246f4dcaccc ("tools/lib/lockdep: drop
liblockdep").

Note that there are still headers that were originally created for
liblockdep, that still have liblockdep references, but they are used by
other tools/ code at this point.

Signed-off-by: Sasha Levin <sashal@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/include/linux/debug_locks.h | 14 --------
 tools/include/linux/hardirq.h     | 12 -------
 tools/include/linux/irqflags.h    | 39 ---------------------
 tools/include/linux/lockdep.h     | 72 ---------------------------------------
 tools/include/linux/proc_fs.h     |  4 ---
 tools/include/linux/spinlock.h    |  2 --
 tools/include/linux/stacktrace.h  | 33 ------------------
 7 files changed, 176 deletions(-)
 delete mode 100644 tools/include/linux/debug_locks.h
 delete mode 100644 tools/include/linux/hardirq.h
 delete mode 100644 tools/include/linux/irqflags.h
 delete mode 100644 tools/include/linux/lockdep.h
 delete mode 100644 tools/include/linux/proc_fs.h
 delete mode 100644 tools/include/linux/stacktrace.h

(limited to 'tools')

diff --git a/tools/include/linux/debug_locks.h b/tools/include/linux/debug_locks.h
deleted file mode 100644
index 72d595ce764a..000000000000
--- a/tools/include/linux/debug_locks.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_DEBUG_LOCKS_H_
-#define _LIBLOCKDEP_DEBUG_LOCKS_H_
-
-#include <stddef.h>
-#include <linux/compiler.h>
-#include <asm/bug.h>
-
-#define DEBUG_LOCKS_WARN_ON(x) WARN_ON(x)
-
-extern bool debug_locks;
-extern bool debug_locks_silent;
-
-#endif
diff --git a/tools/include/linux/hardirq.h b/tools/include/linux/hardirq.h
deleted file mode 100644
index b25580b6a9be..000000000000
--- a/tools/include/linux/hardirq.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LINUX_HARDIRQ_H_
-#define _LIBLOCKDEP_LINUX_HARDIRQ_H_
-
-#define SOFTIRQ_BITS	0UL
-#define HARDIRQ_BITS	0UL
-#define SOFTIRQ_SHIFT	0UL
-#define HARDIRQ_SHIFT	0UL
-#define hardirq_count()	0UL
-#define softirq_count()	0UL
-
-#endif
diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h
deleted file mode 100644
index 501262aee8ff..000000000000
--- a/tools/include/linux/irqflags.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
-#define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
-
-# define lockdep_hardirq_context()	0
-# define lockdep_softirq_context(p)	0
-# define lockdep_hardirqs_enabled()	0
-# define lockdep_softirqs_enabled(p)	0
-# define lockdep_hardirq_enter()	do { } while (0)
-# define lockdep_hardirq_exit()		do { } while (0)
-# define lockdep_softirq_enter()	do { } while (0)
-# define lockdep_softirq_exit()		do { } while (0)
-# define INIT_TRACE_IRQFLAGS
-
-# define stop_critical_timings() do { } while (0)
-# define start_critical_timings() do { } while (0)
-
-#define raw_local_irq_disable() do { } while (0)
-#define raw_local_irq_enable() do { } while (0)
-#define raw_local_irq_save(flags) ((flags) = 0)
-#define raw_local_irq_restore(flags) ((void)(flags))
-#define raw_local_save_flags(flags) ((flags) = 0)
-#define raw_irqs_disabled_flags(flags) ((void)(flags))
-#define raw_irqs_disabled() 0
-#define raw_safe_halt()
-
-#define local_irq_enable() do { } while (0)
-#define local_irq_disable() do { } while (0)
-#define local_irq_save(flags) ((flags) = 0)
-#define local_irq_restore(flags) ((void)(flags))
-#define local_save_flags(flags)	((flags) = 0)
-#define irqs_disabled() (1)
-#define irqs_disabled_flags(flags) ((void)(flags), 0)
-#define safe_halt() do { } while (0)
-
-#define trace_lock_release(x, y)
-#define trace_lock_acquire(a, b, c, d, e, f, g)
-
-#endif
diff --git a/tools/include/linux/lockdep.h b/tools/include/linux/lockdep.h
deleted file mode 100644
index e56997288f2b..000000000000
--- a/tools/include/linux/lockdep.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LOCKDEP_H_
-#define _LIBLOCKDEP_LOCKDEP_H_
-
-#include <sys/prctl.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <limits.h>
-#include <linux/utsname.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/kern_levels.h>
-#include <linux/err.h>
-#include <linux/rcu.h>
-#include <linux/list.h>
-#include <linux/hardirq.h>
-#include <unistd.h>
-
-#define MAX_LOCK_DEPTH 63UL
-
-#define asmlinkage
-#define __visible
-
-#include "../../../include/linux/lockdep.h"
-
-struct task_struct {
-	u64 curr_chain_key;
-	int lockdep_depth;
-	unsigned int lockdep_recursion;
-	struct held_lock held_locks[MAX_LOCK_DEPTH];
-	gfp_t lockdep_reclaim_gfp;
-	int pid;
-	int state;
-	char comm[17];
-};
-
-#define TASK_RUNNING 0
-
-extern struct task_struct *__curr(void);
-
-#define current (__curr())
-
-static inline int debug_locks_off(void)
-{
-	return 1;
-}
-
-#define task_pid_nr(tsk) ((tsk)->pid)
-
-#define KSYM_NAME_LEN 128
-#define printk(...) dprintf(STDOUT_FILENO, __VA_ARGS__)
-#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
-#define pr_warn pr_err
-#define pr_cont pr_err
-
-#define list_del_rcu list_del
-
-#define atomic_t unsigned long
-#define atomic_inc(x) ((*(x))++)
-
-#define print_tainted() ""
-#define static_obj(x) 1
-
-#define debug_show_all_locks()
-extern void debug_check_no_locks_held(void);
-
-static __used bool __is_kernel_percpu_address(unsigned long addr, void *can_addr)
-{
-	return false;
-}
-
-#endif
diff --git a/tools/include/linux/proc_fs.h b/tools/include/linux/proc_fs.h
deleted file mode 100644
index 8b3b03b64fda..000000000000
--- a/tools/include/linux/proc_fs.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _TOOLS_INCLUDE_LINUX_PROC_FS_H
-#define _TOOLS_INCLUDE_LINUX_PROC_FS_H
-
-#endif /* _TOOLS_INCLUDE_LINUX_PROC_FS_H */
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index c934572d935c..622266b197d0 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -37,6 +37,4 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
 	return true;
 }
 
-#include <linux/lockdep.h>
-
 #endif
diff --git a/tools/include/linux/stacktrace.h b/tools/include/linux/stacktrace.h
deleted file mode 100644
index ae343ac35bfa..000000000000
--- a/tools/include/linux/stacktrace.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LINUX_STACKTRACE_H_
-#define _LIBLOCKDEP_LINUX_STACKTRACE_H_
-
-#include <execinfo.h>
-
-struct stack_trace {
-	unsigned int nr_entries, max_entries;
-	unsigned long *entries;
-	int skip;
-};
-
-static inline void print_stack_trace(struct stack_trace *trace, int spaces)
-{
-	backtrace_symbols_fd((void **)trace->entries, trace->nr_entries, 1);
-}
-
-#define save_stack_trace(trace)	\
-	((trace)->nr_entries =	\
-		backtrace((void **)(trace)->entries, (trace)->max_entries))
-
-static inline int dump_stack(void)
-{
-	void *array[64];
-	size_t size;
-
-	size = backtrace(array, 64);
-	backtrace_symbols_fd(array, size, 1);
-
-	return 0;
-}
-
-#endif
-- 
cgit v1.2.3


From ee3a4f666207b5a9d3d4bc7f45c9d59f2aeb3a0d Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Fri, 3 Dec 2021 00:10:13 +0100
Subject: KVM: x86: selftests: svm_int_ctl_test: fix intercept calculation

INTERCEPT_x are bit positions, but the code was using the raw value of
INTERCEPT_VINTR (4) instead of BIT(INTERCEPT_VINTR).
This resulted in masking of bit 2 - that is, SMI instead of VINTR.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Message-Id: <49b9571d25588870db5380b0be1a41df4bbaaf93.1638486479.git.maciej.szmigiero@oracle.com>
---
 tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
index df04f56ce859..30a81038df46 100644
--- a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
+++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
@@ -75,7 +75,7 @@ static void l1_guest_code(struct svm_test_data *svm)
 	vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
 
 	/* No intercepts for real and virtual interrupts */
-	vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR);
+	vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR));
 
 	/* Make a virtual interrupt VINTR_IRQ_NUMBER pending */
 	vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT);
-- 
cgit v1.2.3


From 0525bd82f6a9b5860ed3ea53520ac11ebfa09d5b Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:30 +0100
Subject: objtool, kcsan: Add memory barrier instrumentation to whitelist

Adds KCSAN's memory barrier instrumentation to objtool's uaccess
whitelist.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/objtool/check.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 21735829b860..61dfb66b30b6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -849,6 +849,10 @@ static const char *uaccess_safe_builtin[] = {
 	"__asan_report_store16_noabort",
 	/* KCSAN */
 	"__kcsan_check_access",
+	"__kcsan_mb",
+	"__kcsan_wmb",
+	"__kcsan_rmb",
+	"__kcsan_release",
 	"kcsan_found_watchpoint",
 	"kcsan_setup_watchpoint",
 	"kcsan_check_scoped_accesses",
-- 
cgit v1.2.3


From 0509811952e41ad1e04c50d2378078250b6b7be3 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 30 Nov 2021 12:44:31 +0100
Subject: objtool, kcsan: Remove memory barrier instrumentation from noinstr

Teach objtool to turn instrumentation required for memory barrier
modeling into nops in noinstr text.

The __tsan_func_entry/exit calls are still emitted by compilers even
with the __no_sanitize_thread attribute. The memory barrier
instrumentation will be inserted explicitly (without compiler help), and
thus needs to also explicitly be removed.

Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/objtool/check.c               | 37 +++++++++++++++++++++++++++++++------
 tools/objtool/include/objtool/elf.h |  2 +-
 2 files changed, 32 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 61dfb66b30b6..a9a1f7259d62 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1072,11 +1072,11 @@ static void annotate_call_site(struct objtool_file *file,
 	}
 
 	/*
-	 * Many compilers cannot disable KCOV with a function attribute
-	 * so they need a little help, NOP out any KCOV calls from noinstr
-	 * text.
+	 * Many compilers cannot disable KCOV or sanitizer calls with a function
+	 * attribute so they need a little help, NOP out any such calls from
+	 * noinstr text.
 	 */
-	if (insn->sec->noinstr && sym->kcov) {
+	if (insn->sec->noinstr && sym->profiling_func) {
 		if (reloc) {
 			reloc->type = R_NONE;
 			elf_write_reloc(file->elf, reloc);
@@ -1991,6 +1991,31 @@ static int read_intra_function_calls(struct objtool_file *file)
 	return 0;
 }
 
+/*
+ * Return true if name matches an instrumentation function, where calls to that
+ * function from noinstr code can safely be removed, but compilers won't do so.
+ */
+static bool is_profiling_func(const char *name)
+{
+	/*
+	 * Many compilers cannot disable KCOV with a function attribute.
+	 */
+	if (!strncmp(name, "__sanitizer_cov_", 16))
+		return true;
+
+	/*
+	 * Some compilers currently do not remove __tsan_func_entry/exit nor
+	 * __tsan_atomic_signal_fence (used for barrier instrumentation) with
+	 * the __no_sanitize_thread attribute, remove them. Once the kernel's
+	 * minimum Clang version is 14.0, this can be removed.
+	 */
+	if (!strncmp(name, "__tsan_func_", 12) ||
+	    !strcmp(name, "__tsan_atomic_signal_fence"))
+		return true;
+
+	return false;
+}
+
 static int classify_symbols(struct objtool_file *file)
 {
 	struct section *sec;
@@ -2011,8 +2036,8 @@ static int classify_symbols(struct objtool_file *file)
 			if (!strcmp(func->name, "__fentry__"))
 				func->fentry = true;
 
-			if (!strncmp(func->name, "__sanitizer_cov_", 16))
-				func->kcov = true;
+			if (is_profiling_func(func->name))
+				func->profiling_func = true;
 		}
 	}
 
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index cdc739fa9a6f..d22336781401 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -58,7 +58,7 @@ struct symbol {
 	u8 static_call_tramp : 1;
 	u8 retpoline_thunk   : 1;
 	u8 fentry            : 1;
-	u8 kcov              : 1;
+	u8 profiling_func    : 1;
 	struct list_head pv_target;
 };
 
-- 
cgit v1.2.3


From 92816e2629808726af015c7f5b14adc8e4f8b147 Mon Sep 17 00:00:00 2001
From: Jie2x Zhou <jie2x.zhou@intel.com>
Date: Thu, 9 Dec 2021 10:02:30 +0800
Subject: selftests: net: Correct ping6 expected rc from 2 to 1

./fcnal-test.sh -v -t ipv6_ping
TEST: ping out, VRF bind - ns-B IPv6 LLA                                      [FAIL]
TEST: ping out, VRF bind - multicast IP                                       [FAIL]

ping6 is failing as it should.
COMMAND: ip netns exec ns-A /bin/ping6 -c1 -w1 fe80::7c4c:bcff:fe66:a63a%red
strace of ping6 shows it is failing with '1',
so change the expected rc from 2 to 1.

Fixes: c0644e71df33 ("selftests: Add ipv6 ping tests to fcnal-test")
Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jie2x Zhou <jie2x.zhou@intel.com>
Link: https://lore.kernel.org/r/20211209020230.37270-1-jie2x.zhou@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fcnal-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index a1da013d847b..29cc72d7c3d0 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -2191,7 +2191,7 @@ ipv6_ping_vrf()
 		log_start
 		show_hint "Fails since VRF device does not support linklocal or multicast"
 		run_cmd ${ping6} -c1 -w1 ${a}
-		log_test_addr ${a} $? 2 "ping out, VRF bind"
+		log_test_addr ${a} $? 1 "ping out, VRF bind"
 	done
 
 	for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
-- 
cgit v1.2.3


From c8cc43c1eae2910ac96daa4216e0fb3391ad0504 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 5 Aug 2021 06:54:23 -0400
Subject: selftests: KVM: avoid failures due to reserved HyperTransport region

AMD proceessors define an address range that is reserved by HyperTransport
and causes a failure if used for guest physical addresses.  Avoid
selftests failures by reserving those guest physical addresses; the
rules are:

- On parts with <40 bits, its fully hidden from software.

- Before Fam17h, it was always 12G just below 1T, even if there was more
RAM above this location.  In this case we just not use any RAM above 1T.

- On Fam17h and later, it is variable based on SME, and is either just
below 2^48 (no encryption) or 2^43 (encryption).

Fixes: ef4c9f4f6546 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()")
Cc: stable@vger.kernel.org
Cc: David Matlack <dmatlack@google.com>
Reported-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210805105423.412878-1-pbonzini@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Tested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     |  9 +++
 tools/testing/selftests/kvm/lib/kvm_util.c         |  2 +-
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 68 ++++++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 6a1a37f30494..da2b702da71a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -71,6 +71,15 @@ enum vm_guest_mode {
 
 #endif
 
+#if defined(__x86_64__)
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
+#else
+static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+{
+	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+}
+#endif
+
 #define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8f2e0bb1ef96..daf6fdb217a7 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -302,7 +302,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 		(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
 
 	/* Limit physical addresses to PA-bits. */
-	vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+	vm->max_gfn = vm_compute_max_gfn(vm);
 
 	/* Allocate and setup memory for guest. */
 	vm->vpages_mapped = sparsebit_alloc();
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 82c39db91369..eef7b34756d5 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1431,3 +1431,71 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
 
 	return cpuid;
 }
+
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+static inline unsigned x86_family(unsigned int eax)
+{
+        unsigned int x86;
+
+        x86 = (eax >> 8) & 0xf;
+
+        if (x86 == 0xf)
+                x86 += (eax >> 20) & 0xff;
+
+        return x86;
+}
+
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+{
+	const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
+	unsigned long ht_gfn, max_gfn, max_pfn;
+	uint32_t eax, ebx, ecx, edx, max_ext_leaf;
+
+	max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
+
+	/* Avoid reserved HyperTransport region on AMD processors.  */
+	eax = ecx = 0;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
+	    ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
+	    edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+		return max_gfn;
+
+	/* On parts with <40 physical address bits, the area is fully hidden */
+	if (vm->pa_bits < 40)
+		return max_gfn;
+
+	/* Before family 17h, the HyperTransport area is just below 1T.  */
+	ht_gfn = (1 << 28) - num_ht_pages;
+	eax = 1;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (x86_family(eax) < 0x17)
+		goto done;
+
+	/*
+	 * Otherwise it's at the top of the physical address space, possibly
+	 * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX.  Use
+	 * the old conservative value if MAXPHYADDR is not enumerated.
+	 */
+	eax = 0x80000000;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	max_ext_leaf = eax;
+	if (max_ext_leaf < 0x80000008)
+		goto done;
+
+	eax = 0x80000008;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
+	if (max_ext_leaf >= 0x8000001f) {
+		eax = 0x8000001f;
+		cpuid(&eax, &ebx, &ecx, &edx);
+		max_pfn >>= (ebx >> 6) & 0x3f;
+	}
+
+	ht_gfn = max_pfn - num_ht_pages;
+done:
+	return min(max_gfn, ht_gfn - 1);
+}
-- 
cgit v1.2.3


From 10e7a099bfd860a2b77ea8aaac661f52c16dd865 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 25 Oct 2021 13:13:11 -0700
Subject: selftests: KVM: Add test to verify KVM doesn't explode on "bad" I/O

Add an x86 selftest to verify that KVM doesn't WARN or otherwise explode
if userspace modifies RCX during a userspace exit to handle string I/O.
This is a regression test for a user-triggerable WARN introduced by
commit 3b27de271839 ("KVM: x86: split the two parts of emulator_pio_in").

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211025201311.1881846-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore             |   1 +
 tools/testing/selftests/kvm/Makefile               |   1 +
 .../selftests/kvm/x86_64/userspace_io_test.c       | 114 +++++++++++++++++++++
 3 files changed, 116 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/userspace_io_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 3763105029fb..00814c0f87a6 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -30,6 +30,7 @@
 /x86_64/svm_int_ctl_test
 /x86_64/sync_regs_test
 /x86_64/tsc_msrs_test
+/x86_64/userspace_io_test
 /x86_64/userspace_msr_exit_test
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c4e34717826a..f307c9f61981 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -59,6 +59,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
new file mode 100644
index 000000000000..e4bef2e05686
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID			1
+
+static void guest_ins_port80(uint8_t *buffer, unsigned int count)
+{
+	unsigned long end;
+
+	if (count == 2)
+		end = (unsigned long)buffer + 1;
+	else
+		end = (unsigned long)buffer + 8192;
+
+	asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory");
+	GUEST_ASSERT_1(count == 0, count);
+	GUEST_ASSERT_2((unsigned long)buffer == end, buffer, end);
+}
+
+static void guest_code(void)
+{
+	uint8_t buffer[8192];
+	int i;
+
+	/*
+	 * Special case tests.  main() will adjust RCX 2 => 1 and 3 => 8192 to
+	 * test that KVM doesn't explode when userspace modifies the "count" on
+	 * a userspace I/O exit.  KVM isn't required to play nice with the I/O
+	 * itself as KVM doesn't support manipulating the count, it just needs
+	 * to not explode or overflow a buffer.
+	 */
+	guest_ins_port80(buffer, 2);
+	guest_ins_port80(buffer, 3);
+
+	/* Verify KVM fills the buffer correctly when not stuffing RCX. */
+	memset(buffer, 0, sizeof(buffer));
+	guest_ins_port80(buffer, 8192);
+	for (i = 0; i < 8192; i++)
+		GUEST_ASSERT_2(buffer[i] == 0xaa, i, buffer[i]);
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_regs regs;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct ucall uc;
+	int rc;
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	run = vcpu_state(vm, VCPU_ID);
+
+	memset(&regs, 0, sizeof(regs));
+
+	while (1) {
+		rc = _vcpu_run(vm, VCPU_ID);
+
+		TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Unexpected exit reason: %u (%s),\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		if (get_ucall(vm, VCPU_ID, &uc))
+			break;
+
+		TEST_ASSERT(run->io.port == 0x80,
+			    "Expected I/O at port 0x80, got port 0x%x\n", run->io.port);
+
+		/*
+		 * Modify the rep string count in RCX: 2 => 1 and 3 => 8192.
+		 * Note, this abuses KVM's batching of rep string I/O to avoid
+		 * getting stuck in an infinite loop.  That behavior isn't in
+		 * scope from a testing perspective as it's not ABI in any way,
+		 * i.e. it really is abusing internal KVM knowledge.
+		 */
+		vcpu_regs_get(vm, VCPU_ID, &regs);
+		if (regs.rcx == 2)
+			regs.rcx = 1;
+		if (regs.rcx == 3)
+			regs.rcx = 8192;
+		memset((void *)run + run->io.data_offset, 0xaa, 4096);
+		vcpu_regs_set(vm, VCPU_ID, &regs);
+	}
+
+	switch (uc.cmd) {
+	case UCALL_DONE:
+		break;
+	case UCALL_ABORT:
+		TEST_FAIL("%s at %s:%ld : argN+1 = 0x%lx, argN+2 = 0x%lx",
+			  (const char *)uc.args[0], __FILE__, uc.args[1],
+			  uc.args[2], uc.args[3]);
+	default:
+		TEST_FAIL("Unknown ucall %lu", uc.cmd);
+	}
+
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
cgit v1.2.3


From 0be2516f865f5a876837184a8385163ff64a5889 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Fri, 10 Dec 2021 00:47:00 +0100
Subject: selftests/bpf: Tests for state pruning with u32 spill/fill

This patch adds tests for the verifier's tracking for spilled, <8B
registers. The first two test cases ensure the verifier doesn't
incorrectly prune states in case of <8B spill/fills. The last one simply
checks that a filled u64 register is marked unknown if the register
spilled in the same slack slot was less than 8B.

The map value access at the end of the first program is only incorrect
for the path R6=32. If the precision bit for register R8 isn't
backtracked through the u32 spill/fill, the R6=32 path is pruned at
instruction 9 and the program is incorrectly accepted. The second
program is a variation of the same with u32 spills and a u64 fill.

The additional instructions to introduce the first pruning point may be
a bit fragile as they depend on the heuristics for pruning points in the
verifier (currently at least 8 instructions and 2 jumps). If the
heuristics are changed, the pruning point may move (e.g., to the
subsequent jump) or disappear, which would cause the test to always pass.

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/search_pruning.c        | 71 ++++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/spill_fill.c  | 32 ++++++++++
 2 files changed, 103 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c
index 7e50cb80873a..682519769fe3 100644
--- a/tools/testing/selftests/bpf/verifier/search_pruning.c
+++ b/tools/testing/selftests/bpf/verifier/search_pruning.c
@@ -132,6 +132,77 @@
 	.result = REJECT,
 	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 },
+{
+	"precision tracking for u32 spill/fill",
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV32_IMM(BPF_REG_6, 32),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV32_IMM(BPF_REG_6, 4),
+		/* Additional insns to introduce a pruning point. */
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 0),
+		/* u32 spill/fill */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -8),
+		BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_10, -8),
+		/* out-of-bound map value access for r6=32 */
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8),
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_8b = { 15 },
+	.result = REJECT,
+	.errstr = "R0 min value is outside of the allowed memory range",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"precision tracking for u32 spills, u64 fill",
+	.insns = {
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+		BPF_MOV32_IMM(BPF_REG_7, 0xffffffff),
+		/* Additional insns to introduce a pruning point. */
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		BPF_ALU32_IMM(BPF_DIV, BPF_REG_3, 0),
+		/* u32 spills, u64 fill */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -4),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, -8),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, -8),
+		/* if r8 != X goto pc+1  r8 known in fallthrough branch */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_8, 0xffffffff, 1),
+		BPF_MOV64_IMM(BPF_REG_3, 1),
+		/* if r8 == X goto pc+1  condition always true on first
+		 * traversal, so starts backtracking to mark r8 as requiring
+		 * precision. r7 marked as needing precision. r6 not marked
+		 * since it's not tracked.
+		 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0xffffffff, 1),
+		/* fails if r8 correctly marked unknown after fill. */
+		BPF_ALU32_IMM(BPF_DIV, BPF_REG_3, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "div by zero",
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
 {
 	"allocated_stack",
 	.insns = {
diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c
index 7ab3de108761..6c907144311f 100644
--- a/tools/testing/selftests/bpf/verifier/spill_fill.c
+++ b/tools/testing/selftests/bpf/verifier/spill_fill.c
@@ -175,6 +175,38 @@
 	.errstr = "invalid access to packet",
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
+{
+	"Spill u32 const scalars.  Refill as u64.  Offset to skb->data",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+		    offsetof(struct __sk_buff, data)),
+	BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+		    offsetof(struct __sk_buff, data_end)),
+	/* r6 = 0 */
+	BPF_MOV32_IMM(BPF_REG_6, 0),
+	/* r7 = 20 */
+	BPF_MOV32_IMM(BPF_REG_7, 20),
+	/* *(u32 *)(r10 -4) = r6 */
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -4),
+	/* *(u32 *)(r10 -8) = r7 */
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, -8),
+	/* r4 = *(u64 *)(r10 -8) */
+	BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_10, -8),
+	/* r0 = r2 */
+	BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+	/* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=inv,umax=65535 */
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_4),
+	/* if (r0 > r3) R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=inv,umax=65535 */
+	BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+	/* r0 = *(u32 *)r2 R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=inv20 */
+	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "invalid access to packet",
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
 {
 	"Spill a u32 const scalar.  Refill as u16 from fp-6.  Offset to skb->data",
 	.insns = {
-- 
cgit v1.2.3


From db10415448158779127ad529335e2c447c5767c1 Mon Sep 17 00:00:00 2001
From: Ye Guojin <ye.guojin@zte.com.cn>
Date: Fri, 10 Dec 2021 07:14:24 +0000
Subject: selftests: mptcp: remove duplicate include in mptcp_inq.c

'sys/ioctl.h' included in 'mptcp_inq.c' is duplicated.

Reported-by: ZealRobot <zealci@zte.com.cn>
Signed-off-by: Ye Guojin <ye.guojin@zte.com.cn>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Link: https://lore.kernel.org/r/20211210071424.425773-1-ye.guojin@zte.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_inq.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c
index b8debd4fb5ed..29f75e2a1116 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_inq.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c
@@ -17,7 +17,6 @@
 #include <unistd.h>
 #include <time.h>
 
-#include <sys/ioctl.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
-- 
cgit v1.2.3


From 4cf23a3c6359556a1cca489cf2b901e2b904c4b0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:29 -0800
Subject: libbpf: Fix bpf_prog_load() log_buf logic for log_level 0

To unify libbpf APIs behavior w.r.t. log_buf and log_level, fix
bpf_prog_load() to follow the same logic as bpf_btf_load() and
high-level bpf_object__load() API will follow in the subsequent patches:
  - if log_level is 0 and non-NULL log_buf is provided by a user, attempt
    load operation initially with no log_buf and log_level set;
  - if successful, we are done, return new FD;
  - on error, retry the load operation with log_level bumped to 1 and
    log_buf set; this way verbose logging will be requested only when we
    are sure that there is a failure, but will be fast in the
    common/expected success case.

Of course, user can still specify log_level > 0 from the very beginning
to force log collection.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-2-andrii@kernel.org
---
 tools/lib/bpf/bpf.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 4e7836e1a7b5..3dc86342f0a0 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -303,10 +303,6 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
 	if (log_level && !log_buf)
 		return libbpf_err(-EINVAL);
 
-	attr.log_level = log_level;
-	attr.log_buf = ptr_to_u64(log_buf);
-	attr.log_size = log_size;
-
 	func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0);
 	func_info = OPTS_GET(opts, func_info, NULL);
 	attr.func_info_rec_size = func_info_rec_size;
@@ -321,6 +317,12 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
 
 	attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL));
 
+	if (log_level) {
+		attr.log_buf = ptr_to_u64(log_buf);
+		attr.log_size = log_size;
+		attr.log_level = log_level;
+	}
+
 	fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
 	if (fd >= 0)
 		return fd;
@@ -366,16 +368,17 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
 			goto done;
 	}
 
-	if (log_level || !log_buf)
-		goto done;
+	if (log_level == 0 && log_buf) {
+		/* log_level == 0 with non-NULL log_buf requires retrying on error
+		 * with log_level == 1 and log_buf/log_buf_size set, to get details of
+		 * failure
+		 */
+		attr.log_buf = ptr_to_u64(log_buf);
+		attr.log_size = log_size;
+		attr.log_level = 1;
 
-	/* Try again with log */
-	log_buf[0] = 0;
-	attr.log_buf = ptr_to_u64(log_buf);
-	attr.log_size = log_size;
-	attr.log_level = 1;
-
-	fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
+		fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts);
+	}
 done:
 	/* free() doesn't affect errno, so we don't need to restore it */
 	free(finfo);
-- 
cgit v1.2.3


From 0ed08d6725b5116aaad7a0082d721286e0a43dca Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:30 -0800
Subject: libbpf: Add OPTS-based bpf_btf_load() API

Similar to previous bpf_prog_load() and bpf_map_create() APIs, add
bpf_btf_load() API which is taking optional OPTS struct. Schedule
bpf_load_btf() for deprecation in v0.8 ([0]).

This makes naming consistent with BPF_BTF_LOAD command, sets up an API
for extensibility in the future, moves options parameters (log-related
fields) into optional options, and also allows to pass log_level
directly.

It also removes log buffer auto-allocation logic from low-level API
(consistent with bpf_prog_load() behavior), but preserves a special
treatment of log_level == 0 with non-NULL log_buf, which matches
low-level bpf_prog_load() and high-level libbpf APIs for BTF and program
loading behaviors.

  [0] Closes: https://github.com/libbpf/libbpf/issues/419

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-3-andrii@kernel.org
---
 tools/lib/bpf/bpf.c           | 59 ++++++++++++++++++++++++++++++++++++-------
 tools/lib/bpf/bpf.h           | 19 ++++++++++++--
 tools/lib/bpf/libbpf.map      |  1 +
 tools/lib/bpf/libbpf_probes.c |  2 +-
 4 files changed, 69 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 3dc86342f0a0..6b2407e12060 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -1047,24 +1047,65 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 	return libbpf_err_errno(fd);
 }
 
-int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
-		 bool do_log)
+int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts)
 {
-	union bpf_attr attr = {};
+	const size_t attr_sz = offsetofend(union bpf_attr, btf_log_level);
+	union bpf_attr attr;
+	char *log_buf;
+	size_t log_size;
+	__u32 log_level;
 	int fd;
 
-	attr.btf = ptr_to_u64(btf);
+	memset(&attr, 0, attr_sz);
+
+	if (!OPTS_VALID(opts, bpf_btf_load_opts))
+		return libbpf_err(-EINVAL);
+
+	log_buf = OPTS_GET(opts, log_buf, NULL);
+	log_size = OPTS_GET(opts, log_size, 0);
+	log_level = OPTS_GET(opts, log_level, 0);
+
+	if (log_size > UINT_MAX)
+		return libbpf_err(-EINVAL);
+	if (log_size && !log_buf)
+		return libbpf_err(-EINVAL);
+
+	attr.btf = ptr_to_u64(btf_data);
 	attr.btf_size = btf_size;
+	/* log_level == 0 and log_buf != NULL means "try loading without
+	 * log_buf, but retry with log_buf and log_level=1 on error", which is
+	 * consistent across low-level and high-level BTF and program loading
+	 * APIs within libbpf and provides a sensible behavior in practice
+	 */
+	if (log_level) {
+		attr.btf_log_buf = ptr_to_u64(log_buf);
+		attr.btf_log_size = (__u32)log_size;
+		attr.btf_log_level = log_level;
+	}
 
-retry:
-	if (do_log && log_buf && log_buf_size) {
-		attr.btf_log_level = 1;
-		attr.btf_log_size = log_buf_size;
+	fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz);
+	if (fd < 0 && log_buf && log_level == 0) {
 		attr.btf_log_buf = ptr_to_u64(log_buf);
+		attr.btf_log_size = (__u32)log_size;
+		attr.btf_log_level = 1;
+		fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz);
 	}
+	return libbpf_err_errno(fd);
+}
+
+int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log)
+{
+	LIBBPF_OPTS(bpf_btf_load_opts, opts);
+	int fd;
 
-	fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, sizeof(attr));
+retry:
+	if (do_log && log_buf && log_buf_size) {
+		opts.log_buf = log_buf;
+		opts.log_size = log_buf_size;
+		opts.log_level = 1;
+	}
 
+	fd = bpf_btf_load(btf, btf_size, &opts);
 	if (fd < 0 && !do_log && log_buf && log_buf_size) {
 		do_log = true;
 		goto retry;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index f79e5fbcf1c1..5f7d9636643d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -213,6 +213,23 @@ LIBBPF_API int bpf_verify_program(enum bpf_prog_type type,
 				  char *log_buf, size_t log_buf_sz,
 				  int log_level);
 
+struct bpf_btf_load_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+
+	/* kernel log options */
+	char *log_buf;
+	__u32 log_level;
+	__u32 log_size;
+};
+#define bpf_btf_load_opts__last_field log_size
+
+LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size,
+			    const struct bpf_btf_load_opts *opts);
+
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_btf_load() instead")
+LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf,
+			    __u32 log_buf_size, bool do_log);
+
 LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value,
 				   __u64 flags);
 
@@ -340,8 +357,6 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
 			      __u32 query_flags, __u32 *attach_flags,
 			      __u32 *prog_ids, __u32 *prog_cnt);
 LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
-LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf,
-			    __u32 log_buf_size, bool do_log);
 LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
 				 __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
 				 __u64 *probe_offset, __u64 *probe_addr);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 715df3a27389..08cdfe840436 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -422,6 +422,7 @@ LIBBPF_0.6.0 {
 
 LIBBPF_0.7.0 {
 	global:
+		bpf_btf_load;
 		bpf_program__log_level;
 		bpf_program__set_log_level;
 };
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 41f2be47c2ea..4bdec69523a7 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -164,7 +164,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 	memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len);
 	memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len);
 
-	btf_fd = bpf_load_btf(raw_btf, btf_len, NULL, 0, false);
+	btf_fd = bpf_btf_load(raw_btf, btf_len, NULL);
 
 	free(raw_btf);
 	return btf_fd;
-- 
cgit v1.2.3


From 1a190d1e8eb9ff84354e38f7482dc77b626f3cc9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:31 -0800
Subject: libbpf: Allow passing preallocated log_buf when loading BTF into
 kernel

Add libbpf-internal btf_load_into_kernel() that allows to pass
preallocated log_buf and custom log_level to be passed into kernel
during BPF_BTF_LOAD call. When custom log_buf is provided,
btf_load_into_kernel() won't attempt an retry with automatically
allocated internal temporary buffer to capture BTF validation log.

It's important to note the relation between log_buf and log_level, which
slightly deviates from stricter kernel logic. From kernel's POV, if
log_buf is specified, log_level has to be > 0, and vice versa. While
kernel has good reasons to request such "sanity, this, in practice, is
a bit unconvenient and restrictive for libbpf's high-level bpf_object APIs.

So libbpf will allow to set non-NULL log_buf and log_level == 0. This is
fine and means to attempt to load BTF without logging requested, but if
it failes, retry the load with custom log_buf and log_level 1. Similar
logic will be implemented for program loading. In practice this means
that users can provide custom log buffer just in case error happens, but
not really request slower verbose logging all the time. This is also
consistent with libbpf behavior when custom log_buf is not set: libbpf
first tries to load everything with log_level=0, and only if error
happens allocates internal log buffer and retries with log_level=1.

Also, while at it, make BTF validation log more obvious and follow the log
pattern libbpf is using for dumping BPF verifier log during
BPF_PROG_LOAD. BTF loading resulting in an error will look like this:

libbpf: BTF loading error: -22
libbpf: -- BEGIN BTF LOAD LOG ---
magic: 0xeb9f
version: 1
flags: 0x0
hdr_len: 24
type_off: 0
type_len: 1040
str_off: 1040
str_len: 2063598257
btf_total_size: 1753
Total section length too long
-- END BTF LOAD LOG --
libbpf: Error loading .BTF into kernel: -22. BTF is optional, ignoring.

This makes it much easier to find relevant parts in libbpf log output.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-4-andrii@kernel.org
---
 tools/lib/bpf/btf.c             | 78 +++++++++++++++++++++++++++++------------
 tools/lib/bpf/libbpf_internal.h |  1 +
 2 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 0d7b16eab569..e171424192ae 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1124,54 +1124,86 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf)
 
 static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian);
 
-int btf__load_into_kernel(struct btf *btf)
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level)
 {
-	__u32 log_buf_size = 0, raw_size;
-	char *log_buf = NULL;
+	LIBBPF_OPTS(bpf_btf_load_opts, opts);
+	__u32 buf_sz = 0, raw_size;
+	char *buf = NULL, *tmp;
 	void *raw_data;
 	int err = 0;
 
 	if (btf->fd >= 0)
 		return libbpf_err(-EEXIST);
+	if (log_sz && !log_buf)
+		return libbpf_err(-EINVAL);
 
-retry_load:
-	if (log_buf_size) {
-		log_buf = malloc(log_buf_size);
-		if (!log_buf)
-			return libbpf_err(-ENOMEM);
-
-		*log_buf = 0;
-	}
-
+	/* cache native raw data representation */
 	raw_data = btf_get_raw_data(btf, &raw_size, false);
 	if (!raw_data) {
 		err = -ENOMEM;
 		goto done;
 	}
-	/* cache native raw data representation */
 	btf->raw_size = raw_size;
 	btf->raw_data = raw_data;
 
-	btf->fd = bpf_load_btf(raw_data, raw_size, log_buf, log_buf_size, false);
+retry_load:
+	/* if log_level is 0, we won't provide log_buf/log_size to the kernel,
+	 * initially. Only if BTF loading fails, we bump log_level to 1 and
+	 * retry, using either auto-allocated or custom log_buf. This way
+	 * non-NULL custom log_buf provides a buffer just in case, but hopes
+	 * for successful load and no need for log_buf.
+	 */
+	if (log_level) {
+		/* if caller didn't provide custom log_buf, we'll keep
+		 * allocating our own progressively bigger buffers for BTF
+		 * verification log
+		 */
+		if (!log_buf) {
+			buf_sz = max((__u32)BPF_LOG_BUF_SIZE, buf_sz * 2);
+			tmp = realloc(buf, buf_sz);
+			if (!tmp) {
+				err = -ENOMEM;
+				goto done;
+			}
+			buf = tmp;
+			buf[0] = '\0';
+		}
+
+		opts.log_buf = log_buf ? log_buf : buf;
+		opts.log_size = log_buf ? log_sz : buf_sz;
+		opts.log_level = log_level;
+	}
+
+	btf->fd = bpf_btf_load(raw_data, raw_size, &opts);
 	if (btf->fd < 0) {
-		if (!log_buf || errno == ENOSPC) {
-			log_buf_size = max((__u32)BPF_LOG_BUF_SIZE,
-					   log_buf_size << 1);
-			free(log_buf);
+		/* time to turn on verbose mode and try again */
+		if (log_level == 0) {
+			log_level = 1;
 			goto retry_load;
 		}
+		/* only retry if caller didn't provide custom log_buf, but
+		 * make sure we can never overflow buf_sz
+		 */
+		if (!log_buf && errno == ENOSPC && buf_sz <= UINT_MAX / 2)
+			goto retry_load;
 
 		err = -errno;
-		pr_warn("Error loading BTF: %s(%d)\n", strerror(errno), errno);
-		if (*log_buf)
-			pr_warn("%s\n", log_buf);
-		goto done;
+		pr_warn("BTF loading error: %d\n", err);
+		/* don't print out contents of custom log_buf */
+		if (!log_buf && buf[0])
+			pr_warn("-- BEGIN BTF LOAD LOG ---\n%s\n-- END BTF LOAD LOG --\n", buf);
 	}
 
 done:
-	free(log_buf);
+	free(buf);
 	return libbpf_err(err);
 }
+
+int btf__load_into_kernel(struct btf *btf)
+{
+	return btf_load_into_kernel(btf, NULL, 0, 0);
+}
+
 int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel")));
 
 int btf__fd(const struct btf *btf)
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 6f143e9e810c..355c41019aed 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -277,6 +277,7 @@ int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz);
 int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 			 const char *str_sec, size_t str_len);
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level);
 
 struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf);
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
-- 
cgit v1.2.3


From e0e3ea888c69b4ea17133b8ac8dfd5066a759b5a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:32 -0800
Subject: libbpf: Allow passing user log setting through bpf_object_open_opts

Allow users to provide their own custom log_buf, log_size, and log_level
at bpf_object level through bpf_object_open_opts. This log_buf will be
used during BTF loading. Subsequent patch will use same log_buf during
BPF program loading, unless overriden at per-bpf_program level.

When such custom log_buf is provided, libbpf won't be attempting
retrying loading of BTF to try to provide its own log buffer to capture
kernel's error log output. User is responsible to provide big enough
buffer, otherwise they run a risk of getting -ENOSPC error from the
bpf() syscall.

See also comments in bpf_object_open_opts regarding log_level and
log_buf interactions.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-5-andrii@kernel.org
---
 tools/lib/bpf/bpf.h    |  3 ++-
 tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++++++-
 tools/lib/bpf/libbpf.h | 41 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 65 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 5f7d9636643d..94e553a0ff9d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -195,8 +195,9 @@ struct bpf_load_program_attr {
 /* Flags to direct loading requirements */
 #define MAPS_RELAX_COMPAT	0x01
 
-/* Recommend log buffer size */
+/* Recommended log buffer size */
 #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */
+
 LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead")
 LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 				      char *log_buf, size_t log_buf_sz);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6db0b5e8540e..38999e9c08e0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -573,6 +573,11 @@ struct bpf_object {
 	size_t btf_module_cnt;
 	size_t btf_module_cap;
 
+	/* optional log settings passed to BPF_BTF_LOAD and BPF_PROG_LOAD commands */
+	char *log_buf;
+	size_t log_size;
+	__u32 log_level;
+
 	void *priv;
 	bpf_object_clear_priv_t clear_priv;
 
@@ -3017,7 +3022,9 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj)
 		 */
 		btf__set_fd(kern_btf, 0);
 	} else {
-		err = btf__load_into_kernel(kern_btf);
+		/* currently BPF_BTF_LOAD only supports log_level 1 */
+		err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size,
+					   obj->log_level ? 1 : 0);
 	}
 	if (sanitize) {
 		if (!err) {
@@ -6932,6 +6939,9 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz,
 	struct bpf_object *obj;
 	char tmp_name[64];
 	int err;
+	char *log_buf;
+	size_t log_size;
+	__u32 log_level;
 
 	if (elf_version(EV_CURRENT) == EV_NONE) {
 		pr_warn("failed to init libelf for %s\n",
@@ -6954,10 +6964,22 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz,
 		pr_debug("loading object '%s' from buffer\n", obj_name);
 	}
 
+	log_buf = OPTS_GET(opts, kernel_log_buf, NULL);
+	log_size = OPTS_GET(opts, kernel_log_size, 0);
+	log_level = OPTS_GET(opts, kernel_log_level, 0);
+	if (log_size > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+	if (log_size && !log_buf)
+		return ERR_PTR(-EINVAL);
+
 	obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name);
 	if (IS_ERR(obj))
 		return obj;
 
+	obj->log_buf = log_buf;
+	obj->log_size = log_size;
+	obj->log_level = log_level;
+
 	btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL);
 	if (btf_tmp_path) {
 		if (strlen(btf_tmp_path) >= PATH_MAX) {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 4802c1e736c3..5c984c63859f 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -108,8 +108,47 @@ struct bpf_object_open_opts {
 	 * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux.
 	 */
 	const char *btf_custom_path;
+	/* Pointer to a buffer for storing kernel logs for applicable BPF
+	 * commands. Valid kernel_log_size has to be specified as well and are
+	 * passed-through to bpf() syscall. Keep in mind that kernel might
+	 * fail operation with -ENOSPC error if provided buffer is too small
+	 * to contain entire log output.
+	 * See the comment below for kernel_log_level for interaction between
+	 * log_buf and log_level settings.
+	 *
+	 * If specified, this log buffer will be passed for:
+	 *   - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden
+	 *     with bpf_program__set_log() on per-program level, to get
+	 *     BPF verifier log output.
+	 *   - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get
+	 *     BTF sanity checking log.
+	 *
+	 * Each BPF command (BPF_BTF_LOAD or BPF_PROG_LOAD) will overwrite
+	 * previous contents, so if you need more fine-grained control, set
+	 * per-program buffer with bpf_program__set_log_buf() to preserve each
+	 * individual program's verification log. Keep using kernel_log_buf
+	 * for BTF verification log, if necessary.
+	 */
+	char *kernel_log_buf;
+	size_t kernel_log_size;
+	/*
+	 * Log level can be set independently from log buffer. Log_level=0
+	 * means that libbpf will attempt loading BTF or program without any
+	 * logging requested, but will retry with either its own or custom log
+	 * buffer, if provided, and log_level=1 on any error.
+	 * And vice versa, setting log_level>0 will request BTF or prog
+	 * loading with verbose log from the first attempt (and as such also
+	 * for successfully loaded BTF or program), and the actual log buffer
+	 * could be either libbpf's own auto-allocated log buffer, if
+	 * kernel_log_buffer is NULL, or user-provided custom kernel_log_buf.
+	 * If user didn't provide custom log buffer, libbpf will emit captured
+	 * logs through its print callback.
+	 */
+	__u32 kernel_log_level;
+
+	size_t :0;
 };
-#define bpf_object_open_opts__last_field btf_custom_path
+#define bpf_object_open_opts__last_field kernel_log_level
 
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
 
-- 
cgit v1.2.3


From ad9a7f96445b70c415d8e193f854321b110c890a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:33 -0800
Subject: libbpf: Improve logging around BPF program loading

Add missing "prog '%s': " prefixes in few places and use consistently
markers for beginning and end of program load logs. Here's an example of
log output:

libbpf: prog 'handler': BPF program load failed: Permission denied
libbpf: -- BEGIN PROG LOAD LOG ---
arg#0 reference type('UNKNOWN ') size cannot be determined: -22
; out1 = in1;
0: (18) r1 = 0xffffc9000cdcc000
2: (61) r1 = *(u32 *)(r1 +0)

...

81: (63) *(u32 *)(r4 +0) = r5
 R1_w=map_value(id=0,off=16,ks=4,vs=20,imm=0) R4=map_value(id=0,off=400,ks=4,vs=16,imm=0)
invalid access to map value, value_size=16 off=400 size=4
R4 min value is outside of the allowed memory range
processed 63 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
 -- END PROG LOAD LOG --
libbpf: failed to load program 'handler'
libbpf: failed to load object 'test_skeleton'

The entire verifier log, including BEGIN and END markers are now always
youtput during a single print callback call. This should make it much
easier to post-process or parse it, if necessary. It's not an explicit
API guarantee, but it can be reasonably expected to stay like that.

Also __bpf_object__open is renamed to bpf_object_open() as it's always
an adventure to find the exact function that implements bpf_object's
open phase, so drop the double underscored and use internal libbpf
naming convention.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-6-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c                             | 38 +++++++++++-----------
 .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c  |  6 ++--
 2 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 38999e9c08e0..f07ff39a9d20 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6662,8 +6662,10 @@ retry_load:
 	ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr);
 
 	if (ret >= 0) {
-		if (log_buf && load_attr.log_level)
-			pr_debug("verifier log:\n%s", log_buf);
+		if (log_buf && load_attr.log_level) {
+			pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
+				 prog->name, log_buf);
+		}
 
 		if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) {
 			struct bpf_map *map;
@@ -6676,8 +6678,8 @@ retry_load:
 
 				if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) {
 					cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
-					pr_warn("prog '%s': failed to bind .rodata map: %s\n",
-						prog->name, cp);
+					pr_warn("prog '%s': failed to bind map '%s': %s\n",
+						prog->name, map->real_name, cp);
 					/* Don't fail hard if can't bind rodata. */
 				}
 			}
@@ -6691,23 +6693,22 @@ retry_load:
 	if (!log_buf || errno == ENOSPC) {
 		log_buf_size = max((size_t)BPF_LOG_BUF_SIZE,
 				   log_buf_size << 1);
-
 		free(log_buf);
 		goto retry_load;
 	}
 	ret = errno ? -errno : -LIBBPF_ERRNO__LOAD;
 	cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
-	pr_warn("load bpf program failed: %s\n", cp);
+	pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp);
 	pr_perm_msg(ret);
 
 	if (log_buf && log_buf[0] != '\0') {
 		ret = -LIBBPF_ERRNO__VERIFY;
-		pr_warn("-- BEGIN DUMP LOG ---\n");
-		pr_warn("\n%s\n", log_buf);
-		pr_warn("-- END LOG --\n");
-	} else if (insns_cnt >= BPF_MAXINSNS) {
-		pr_warn("Program too large (%d insns), at most %d insns\n",
-			insns_cnt, BPF_MAXINSNS);
+		pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
+			prog->name, log_buf);
+	}
+	if (insns_cnt >= BPF_MAXINSNS) {
+		pr_warn("prog '%s': program too large (%d insns), at most %d insns\n",
+			prog->name, insns_cnt, BPF_MAXINSNS);
 		ret = -LIBBPF_ERRNO__PROG2BIG;
 	} else if (prog->type != BPF_PROG_TYPE_KPROBE) {
 		/* Wrong program type? */
@@ -6931,9 +6932,8 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object
 	return 0;
 }
 
-static struct bpf_object *
-__bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz,
-		   const struct bpf_object_open_opts *opts)
+static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz,
+					  const struct bpf_object_open_opts *opts)
 {
 	const char *obj_name, *kconfig, *btf_tmp_path;
 	struct bpf_object *obj;
@@ -7033,7 +7033,7 @@ __bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags)
 		return NULL;
 
 	pr_debug("loading %s\n", attr->file);
-	return __bpf_object__open(attr->file, NULL, 0, &opts);
+	return bpf_object_open(attr->file, NULL, 0, &opts);
 }
 
 struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr)
@@ -7059,7 +7059,7 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts)
 
 	pr_debug("loading %s\n", path);
 
-	return libbpf_ptr(__bpf_object__open(path, NULL, 0, opts));
+	return libbpf_ptr(bpf_object_open(path, NULL, 0, opts));
 }
 
 struct bpf_object *
@@ -7069,7 +7069,7 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
 	if (!obj_buf || obj_buf_sz == 0)
 		return libbpf_err_ptr(-EINVAL);
 
-	return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, opts));
+	return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts));
 }
 
 struct bpf_object *
@@ -7086,7 +7086,7 @@ bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz,
 	if (!obj_buf || obj_buf_sz == 0)
 		return errno = EINVAL, NULL;
 
-	return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, &opts));
+	return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, &opts));
 }
 
 static int bpf_object_unload(struct bpf_object *obj)
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index 94e03df69d71..8daca0ac909f 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -217,14 +217,16 @@ static bool found;
 static int libbpf_debug_print(enum libbpf_print_level level,
 			      const char *format, va_list args)
 {
-	char *log_buf;
+	const char *log_buf;
 
 	if (level != LIBBPF_WARN ||
-	    strcmp(format, "libbpf: \n%s\n")) {
+	    !strstr(format, "-- BEGIN PROG LOAD LOG --")) {
 		vprintf(format, args);
 		return 0;
 	}
 
+	/* skip prog_name */
+	va_arg(args, char *);
 	log_buf = va_arg(args, char *);
 	if (!log_buf)
 		goto out;
-- 
cgit v1.2.3


From 2eda2145ebfc76569fd088f46356203fc0c785a1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:34 -0800
Subject: libbpf: Preserve kernel error code and remove kprobe prog type
 guessing

Instead of rewriting error code returned by the kernel of prog load with
libbpf-sepcific variants pass through the original error.

There is now also no need to have a backup generic -LIBBPF_ERRNO__LOAD
fallback error as bpf_prog_load() guarantees that errno will be properly
set no matter what.

Also drop a completely outdated and pretty useless BPF_PROG_TYPE_KPROBE
guess logic. It's not necessary and neither it's helpful in modern BPF
applications.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-7-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index f07ff39a9d20..3fd4e3d5a11f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -6696,34 +6696,19 @@ retry_load:
 		free(log_buf);
 		goto retry_load;
 	}
-	ret = errno ? -errno : -LIBBPF_ERRNO__LOAD;
+
+	ret = -errno;
 	cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 	pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp);
 	pr_perm_msg(ret);
 
 	if (log_buf && log_buf[0] != '\0') {
-		ret = -LIBBPF_ERRNO__VERIFY;
 		pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
 			prog->name, log_buf);
 	}
 	if (insns_cnt >= BPF_MAXINSNS) {
 		pr_warn("prog '%s': program too large (%d insns), at most %d insns\n",
 			prog->name, insns_cnt, BPF_MAXINSNS);
-		ret = -LIBBPF_ERRNO__PROG2BIG;
-	} else if (prog->type != BPF_PROG_TYPE_KPROBE) {
-		/* Wrong program type? */
-		int fd;
-
-		load_attr.expected_attach_type = 0;
-		load_attr.log_buf = NULL;
-		load_attr.log_size = 0;
-		fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog_name, license,
-				   insns, insns_cnt, &load_attr);
-		if (fd >= 0) {
-			close(fd);
-			ret = -LIBBPF_ERRNO__PROGTYPE;
-			goto out;
-		}
 	}
 
 out:
-- 
cgit v1.2.3


From b3ce907950350a58880b94fed2b6022f160b8b9a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:35 -0800
Subject: libbpf: Add per-program log buffer setter and getter

Allow to set user-provided log buffer on a per-program basis ([0]). This
gives great deal of flexibility in terms of which programs are loaded
with logging enabled and where corresponding logs go.

Log buffer set with bpf_program__set_log_buf() overrides kernel_log_buf
and kernel_log_size settings set at bpf_object open time through
bpf_object_open_opts, if any.

Adjust bpf_object_load_prog_instance() logic to not perform own log buf
allocation and load retry if custom log buffer is provided by the user.

  [0] Closes: https://github.com/libbpf/libbpf/issues/418

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-8-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c   | 92 +++++++++++++++++++++++++++++++++++++++---------
 tools/lib/bpf/libbpf.h   |  7 ++++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 84 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3fd4e3d5a11f..e3e56bebd014 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -331,7 +331,11 @@ struct bpf_program {
 
 	struct reloc_desc *reloc_desc;
 	int nr_reloc;
-	int log_level;
+
+	/* BPF verifier log settings */
+	char *log_buf;
+	size_t log_size;
+	__u32 log_level;
 
 	struct {
 		int nr;
@@ -713,6 +717,9 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog,
 	prog->instances.fds = NULL;
 	prog->instances.nr = -1;
 
+	/* inherit object's log_level */
+	prog->log_level = obj->log_level;
+
 	prog->sec_name = strdup(sec_name);
 	if (!prog->sec_name)
 		goto errout;
@@ -6591,8 +6598,10 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 	const char *prog_name = NULL;
 	char *cp, errmsg[STRERR_BUFSIZE];
 	size_t log_buf_size = 0;
-	char *log_buf = NULL;
+	char *log_buf = NULL, *tmp;
 	int btf_fd, ret, err;
+	bool own_log_buf = true;
+	__u32 log_level = prog->log_level;
 
 	if (prog->type == BPF_PROG_TYPE_UNSPEC) {
 		/*
@@ -6627,7 +6636,7 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 		load_attr.line_info_rec_size = prog->line_info_rec_size;
 		load_attr.line_info_cnt = prog->line_info_cnt;
 	}
-	load_attr.log_level = prog->log_level;
+	load_attr.log_level = log_level;
 	load_attr.prog_flags = prog->prog_flags;
 	load_attr.fd_array = obj->fd_array;
 
@@ -6648,21 +6657,42 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog
 		*prog_fd = -1;
 		return 0;
 	}
-retry_load:
-	if (log_buf_size) {
-		log_buf = malloc(log_buf_size);
-		if (!log_buf)
-			return -ENOMEM;
 
-		*log_buf = 0;
+retry_load:
+	/* if log_level is zero, we don't request logs initiallly even if
+	 * custom log_buf is specified; if the program load fails, then we'll
+	 * bump log_level to 1 and use either custom log_buf or we'll allocate
+	 * our own and retry the load to get details on what failed
+	 */
+	if (log_level) {
+		if (prog->log_buf) {
+			log_buf = prog->log_buf;
+			log_buf_size = prog->log_size;
+			own_log_buf = false;
+		} else if (obj->log_buf) {
+			log_buf = obj->log_buf;
+			log_buf_size = obj->log_size;
+			own_log_buf = false;
+		} else {
+			log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size * 2);
+			tmp = realloc(log_buf, log_buf_size);
+			if (!tmp) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			log_buf = tmp;
+			log_buf[0] = '\0';
+			own_log_buf = true;
+		}
 	}
 
 	load_attr.log_buf = log_buf;
 	load_attr.log_size = log_buf_size;
-	ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr);
+	load_attr.log_level = log_level;
 
+	ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr);
 	if (ret >= 0) {
-		if (log_buf && load_attr.log_level) {
+		if (log_level && own_log_buf) {
 			pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
 				 prog->name, log_buf);
 		}
@@ -6690,19 +6720,26 @@ retry_load:
 		goto out;
 	}
 
-	if (!log_buf || errno == ENOSPC) {
-		log_buf_size = max((size_t)BPF_LOG_BUF_SIZE,
-				   log_buf_size << 1);
-		free(log_buf);
+	if (log_level == 0) {
+		log_level = 1;
 		goto retry_load;
 	}
+	/* On ENOSPC, increase log buffer size and retry, unless custom
+	 * log_buf is specified.
+	 * Be careful to not overflow u32, though. Kernel's log buf size limit
+	 * isn't part of UAPI so it can always be bumped to full 4GB. So don't
+	 * multiply by 2 unless we are sure we'll fit within 32 bits.
+	 * Currently, we'll get -EINVAL when we reach (UINT_MAX >> 2).
+	 */
+	if (own_log_buf && errno == ENOSPC && log_buf_size <= UINT_MAX / 2)
+		goto retry_load;
 
 	ret = -errno;
 	cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 	pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp);
 	pr_perm_msg(ret);
 
-	if (log_buf && log_buf[0] != '\0') {
+	if (own_log_buf && log_buf && log_buf[0] != '\0') {
 		pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
 			prog->name, log_buf);
 	}
@@ -6712,7 +6749,8 @@ retry_load:
 	}
 
 out:
-	free(log_buf);
+	if (own_log_buf)
+		free(log_buf);
 	return ret;
 }
 
@@ -8498,6 +8536,26 @@ int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level)
 	return 0;
 }
 
+const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size)
+{
+	*log_size = prog->log_size;
+	return prog->log_buf;
+}
+
+int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size)
+{
+	if (log_size && !log_buf)
+		return -EINVAL;
+	if (prog->log_size > UINT_MAX)
+		return -EINVAL;
+	if (prog->obj->loaded)
+		return -EBUSY;
+
+	prog->log_buf = log_buf;
+	prog->log_size = log_size;
+	return 0;
+}
+
 #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) {			    \
 	.sec = sec_pfx,							    \
 	.prog_type = BPF_PROG_TYPE_##ptype,				    \
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 5c984c63859f..dacde55bebff 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -591,8 +591,15 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog,
 
 LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags);
+
+/* Per-program log level and log buffer getters/setters.
+ * See bpf_object_open_opts comments regarding log_level and log_buf
+ * interactions.
+ */
 LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level);
+LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size);
+LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size);
 
 LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 08cdfe840436..4d483af7dba6 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -423,6 +423,8 @@ LIBBPF_0.6.0 {
 LIBBPF_0.7.0 {
 	global:
 		bpf_btf_load;
+		bpf_program__log_buf;
 		bpf_program__log_level;
+		bpf_program__set_log_buf;
 		bpf_program__set_log_level;
 };
-- 
cgit v1.2.3


From e7b924ca715f0d1c0be62b205c36c4076b335421 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:36 -0800
Subject: libbpf: Deprecate bpf_object__load_xattr()

Deprecate non-extensible bpf_object__load_xattr() in v0.8 ([0]).

With log_level control through bpf_object_open_opts or
bpf_program__set_log_level(), we are finally at the point where
bpf_object__load_xattr() doesn't provide any functionality that can't be
accessed through other (better) ways. The other feature,
target_btf_path, is also controllable through bpf_object_open_opts.

  [0] Closes: https://github.com/libbpf/libbpf/issues/289

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-9-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 23 ++++++++++-------------
 tools/lib/bpf/libbpf.h |  1 +
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e3e56bebd014..18d95c6a89fe 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7462,14 +7462,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
 	return 0;
 }
 
-int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
+static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
 {
-	struct bpf_object *obj;
 	int err, i;
 
-	if (!attr)
-		return libbpf_err(-EINVAL);
-	obj = attr->obj;
 	if (!obj)
 		return libbpf_err(-EINVAL);
 
@@ -7479,7 +7475,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	}
 
 	if (obj->gen_loader)
-		bpf_gen__init(obj->gen_loader, attr->log_level);
+		bpf_gen__init(obj->gen_loader, extra_log_level);
 
 	err = bpf_object__probe_loading(obj);
 	err = err ? : bpf_object__load_vmlinux_btf(obj, false);
@@ -7488,8 +7484,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	err = err ? : bpf_object__sanitize_maps(obj);
 	err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
 	err = err ? : bpf_object__create_maps(obj);
-	err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path);
-	err = err ? : bpf_object__load_progs(obj, attr->log_level);
+	err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
+	err = err ? : bpf_object__load_progs(obj, extra_log_level);
 	err = err ? : bpf_object_init_prog_arrays(obj);
 
 	if (obj->gen_loader) {
@@ -7534,13 +7530,14 @@ out:
 	return libbpf_err(err);
 }
 
-int bpf_object__load(struct bpf_object *obj)
+int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 {
-	struct bpf_object_load_attr attr = {
-		.obj = obj,
-	};
+	return bpf_object_load(attr->obj, attr->log_level, attr->target_btf_path);
+}
 
-	return bpf_object__load_xattr(&attr);
+int bpf_object__load(struct bpf_object *obj)
+{
+	return bpf_object_load(obj, 0, NULL);
 }
 
 static int make_parent_dir(const char *path)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index dacde55bebff..a8b894dae633 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -214,6 +214,7 @@ struct bpf_object_load_attr {
 
 /* Load/unload object into/from kernel */
 LIBBPF_API int bpf_object__load(struct bpf_object *obj);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__load() instead")
 LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr);
 LIBBPF_DEPRECATED_SINCE(0, 6, "bpf_object__unload() is deprecated, use bpf_object__close() instead")
 LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
-- 
cgit v1.2.3


From dc94121b5ca17adaaabb7959c10d9c6ea504f7b1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:37 -0800
Subject: selftests/bpf: Replace all uses of bpf_load_btf() with bpf_btf_load()

Switch all selftests uses of to-be-deprecated bpf_load_btf() with
equivalent bpf_btf_load() calls.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-10-andrii@kernel.org
---
 .../selftests/bpf/map_tests/sk_storage_map.c       |  2 +-
 tools/testing/selftests/bpf/prog_tests/btf.c       | 50 +++++++++++++---------
 tools/testing/selftests/bpf/test_verifier.c        |  2 +-
 3 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
index 8eea4ffeb092..099eb4dfd4f7 100644
--- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
@@ -136,7 +136,7 @@ static int load_btf(void)
 	memcpy(raw_btf + sizeof(btf_hdr) + sizeof(btf_raw_types),
 	       btf_str_sec, sizeof(btf_str_sec));
 
-	return bpf_load_btf(raw_btf, sizeof(raw_btf), 0, 0, 0);
+	return bpf_btf_load(raw_btf, sizeof(raw_btf), NULL);
 }
 
 static int create_sk_storage_map(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index cab810bab593..01b776a7beeb 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -4071,6 +4071,28 @@ done:
 	return raw_btf;
 }
 
+static int load_raw_btf(const void *raw_data, size_t raw_size)
+{
+	LIBBPF_OPTS(bpf_btf_load_opts, opts);
+	int btf_fd;
+
+	if (always_log) {
+		opts.log_buf = btf_log_buf,
+		opts.log_size = BTF_LOG_BUF_SIZE,
+		opts.log_level = 1;
+	}
+
+	btf_fd = bpf_btf_load(raw_data, raw_size, &opts);
+	if (btf_fd < 0 && !always_log) {
+		opts.log_buf = btf_log_buf,
+		opts.log_size = BTF_LOG_BUF_SIZE,
+		opts.log_level = 1;
+		btf_fd = bpf_btf_load(raw_data, raw_size, &opts);
+	}
+
+	return btf_fd;
+}
+
 static void do_test_raw(unsigned int test_num)
 {
 	struct btf_raw_test *test = &raw_tests[test_num - 1];
@@ -4100,16 +4122,14 @@ static void do_test_raw(unsigned int test_num)
 	hdr->str_len = (int)hdr->str_len + test->str_len_delta;
 
 	*btf_log_buf = '\0';
-	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
-			      btf_log_buf, BTF_LOG_BUF_SIZE,
-			      always_log);
+	btf_fd = load_raw_btf(raw_btf, raw_btf_size);
 	free(raw_btf);
 
 	err = ((btf_fd < 0) != test->btf_load_err);
 	if (CHECK(err, "btf_fd:%d test->btf_load_err:%u",
 		  btf_fd, test->btf_load_err) ||
 	    CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
-		  "expected err_str:%s", test->err_str)) {
+		  "expected err_str:%s\n", test->err_str)) {
 		err = -1;
 		goto done;
 	}
@@ -4227,9 +4247,7 @@ static int test_big_btf_info(unsigned int test_num)
 		goto done;
 	}
 
-	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
-			      btf_log_buf, BTF_LOG_BUF_SIZE,
-			      always_log);
+	btf_fd = load_raw_btf(raw_btf, raw_btf_size);
 	if (CHECK(btf_fd < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
@@ -4315,9 +4333,7 @@ static int test_btf_id(unsigned int test_num)
 		info[i].btf_size = raw_btf_size;
 	}
 
-	btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size,
-				 btf_log_buf, BTF_LOG_BUF_SIZE,
-				 always_log);
+	btf_fd[0] = load_raw_btf(raw_btf, raw_btf_size);
 	if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
@@ -4447,9 +4463,7 @@ static void do_test_get_info(unsigned int test_num)
 		goto done;
 	}
 
-	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
-			      btf_log_buf, BTF_LOG_BUF_SIZE,
-			      always_log);
+	btf_fd = load_raw_btf(raw_btf, raw_btf_size);
 	if (CHECK(btf_fd <= 0, "errno:%d", errno)) {
 		err = -1;
 		goto done;
@@ -5169,12 +5183,10 @@ static void do_test_pprint(int test_num)
 		return;
 
 	*btf_log_buf = '\0';
-	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
-			      btf_log_buf, BTF_LOG_BUF_SIZE,
-			      always_log);
+	btf_fd = load_raw_btf(raw_btf, raw_btf_size);
 	free(raw_btf);
 
-	if (CHECK(btf_fd < 0, "errno:%d", errno)) {
+	if (CHECK(btf_fd < 0, "errno:%d\n", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -6538,9 +6550,7 @@ static void do_test_info_raw(unsigned int test_num)
 		return;
 
 	*btf_log_buf = '\0';
-	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
-			      btf_log_buf, BTF_LOG_BUF_SIZE,
-			      always_log);
+	btf_fd = load_raw_btf(raw_btf, raw_btf_size);
 	free(raw_btf);
 
 	if (CHECK(btf_fd < 0, "invalid btf_fd errno:%d", errno)) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 222cb063ddf4..07b88a8f504f 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -641,7 +641,7 @@ static int load_btf(void)
 	memcpy(ptr, btf_str_sec, hdr.str_len);
 	ptr += hdr.str_len;
 
-	btf_fd = bpf_load_btf(raw_btf, ptr - raw_btf, 0, 0, 0);
+	btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, NULL);
 	free(raw_btf);
 	if (btf_fd < 0)
 		return -1;
-- 
cgit v1.2.3


From 57e889269af3dd0609933e2550c4baee7a7eb84c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:38 -0800
Subject: selftests/bpf: Add test for libbpf's custom log_buf behavior

Add a selftest that validates that per-program and per-object log_buf
overrides work as expected. Also test same logic for low-level
bpf_prog_load() and bpf_btf_load() APIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-11-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/log_buf.c | 276 +++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_log_buf.c |  24 ++
 2 files changed, 300 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/log_buf.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_log_buf.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c
new file mode 100644
index 000000000000..e469b023962b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+#include "test_log_buf.skel.h"
+
+static size_t libbpf_log_pos;
+static char libbpf_log_buf[1024 * 1024];
+static bool libbpf_log_error;
+
+static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt, va_list args)
+{
+	int emitted_cnt;
+	size_t left_cnt;
+
+	left_cnt = sizeof(libbpf_log_buf) - libbpf_log_pos;
+	emitted_cnt = vsnprintf(libbpf_log_buf + libbpf_log_pos, left_cnt, fmt, args);
+
+	if (emitted_cnt < 0 || emitted_cnt + 1 > left_cnt) {
+		libbpf_log_error = true;
+		return 0;
+	}
+
+	libbpf_log_pos += emitted_cnt;
+	return 0;
+}
+
+static void obj_load_log_buf(void)
+{
+	libbpf_print_fn_t old_print_cb = libbpf_set_print(libbpf_print_cb);
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	const size_t log_buf_sz = 1024 * 1024;
+	struct test_log_buf* skel;
+	char *obj_log_buf, *good_log_buf, *bad_log_buf;
+	int err;
+
+	obj_log_buf = malloc(3 * log_buf_sz);
+	if (!ASSERT_OK_PTR(obj_log_buf, "obj_log_buf"))
+		return;
+
+	good_log_buf = obj_log_buf + log_buf_sz;
+	bad_log_buf = obj_log_buf + 2 * log_buf_sz;
+	obj_log_buf[0] = good_log_buf[0] = bad_log_buf[0] = '\0';
+
+	opts.kernel_log_buf = obj_log_buf;
+	opts.kernel_log_size = log_buf_sz;
+	opts.kernel_log_level = 4; /* for BTF this will turn into 1 */
+
+	/* In the first round every prog has its own log_buf, so libbpf logs
+	 * don't have program failure logs
+	 */
+	skel = test_log_buf__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	/* set very verbose level for good_prog so we always get detailed logs */
+	bpf_program__set_log_buf(skel->progs.good_prog, good_log_buf, log_buf_sz);
+	bpf_program__set_log_level(skel->progs.good_prog, 2);
+
+	bpf_program__set_log_buf(skel->progs.bad_prog, bad_log_buf, log_buf_sz);
+	/* log_level 0 with custom log_buf means that verbose logs are not
+	 * requested if program load is successful, but libbpf should retry
+	 * with log_level 1 on error and put program's verbose load log into
+	 * custom log_buf
+	 */
+	bpf_program__set_log_level(skel->progs.bad_prog, 0);
+
+	err = test_log_buf__load(skel);
+	if (!ASSERT_ERR(err, "unexpected_load_success"))
+		goto cleanup;
+
+	ASSERT_FALSE(libbpf_log_error, "libbpf_log_error");
+
+	/* there should be no prog loading log because we specified per-prog log buf */
+	ASSERT_NULL(strstr(libbpf_log_buf, "-- BEGIN PROG LOAD LOG --"), "unexp_libbpf_log");
+	ASSERT_OK_PTR(strstr(libbpf_log_buf, "prog 'bad_prog': BPF program load failed"),
+		      "libbpf_log_not_empty");
+	ASSERT_OK_PTR(strstr(obj_log_buf, "DATASEC license"), "obj_log_not_empty");
+	ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx(id=0,off=0,imm=0) R10=fp0"),
+		      "good_log_verbose");
+	ASSERT_OK_PTR(strstr(bad_log_buf, "invalid access to map value, value_size=16 off=16000 size=4"),
+		      "bad_log_not_empty");
+
+	if (env.verbosity > VERBOSE_NONE) {
+		printf("LIBBPF LOG:   \n=================\n%s=================\n", libbpf_log_buf);
+		printf("OBJ LOG:      \n=================\n%s=================\n", obj_log_buf);
+		printf("GOOD_PROG LOG:\n=================\n%s=================\n", good_log_buf);
+		printf("BAD_PROG  LOG:\n=================\n%s=================\n", bad_log_buf);
+	}
+
+	/* reset everything */
+	test_log_buf__destroy(skel);
+	obj_log_buf[0] = good_log_buf[0] = bad_log_buf[0] = '\0';
+	libbpf_log_buf[0] = '\0';
+	libbpf_log_pos = 0;
+	libbpf_log_error = false;
+
+	/* In the second round we let bad_prog's failure be logged through print callback */
+	opts.kernel_log_buf = NULL; /* let everything through into print callback */
+	opts.kernel_log_size = 0;
+	opts.kernel_log_level = 1;
+
+	skel = test_log_buf__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	/* set normal verbose level for good_prog to check log_level is taken into account */
+	bpf_program__set_log_buf(skel->progs.good_prog, good_log_buf, log_buf_sz);
+	bpf_program__set_log_level(skel->progs.good_prog, 1);
+
+	err = test_log_buf__load(skel);
+	if (!ASSERT_ERR(err, "unexpected_load_success"))
+		goto cleanup;
+
+	ASSERT_FALSE(libbpf_log_error, "libbpf_log_error");
+
+	/* this time prog loading error should be logged through print callback */
+	ASSERT_OK_PTR(strstr(libbpf_log_buf, "libbpf: prog 'bad_prog': -- BEGIN PROG LOAD LOG --"),
+		      "libbpf_log_correct");
+	ASSERT_STREQ(obj_log_buf, "", "obj_log__empty");
+	ASSERT_STREQ(good_log_buf, "processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0\n",
+		     "good_log_ok");
+	ASSERT_STREQ(bad_log_buf, "", "bad_log_empty");
+
+	if (env.verbosity > VERBOSE_NONE) {
+		printf("LIBBPF LOG:   \n=================\n%s=================\n", libbpf_log_buf);
+		printf("OBJ LOG:      \n=================\n%s=================\n", obj_log_buf);
+		printf("GOOD_PROG LOG:\n=================\n%s=================\n", good_log_buf);
+		printf("BAD_PROG  LOG:\n=================\n%s=================\n", bad_log_buf);
+	}
+
+cleanup:
+	free(obj_log_buf);
+	test_log_buf__destroy(skel);
+	libbpf_set_print(old_print_cb);
+}
+
+static void bpf_prog_load_log_buf(void)
+{
+	const struct bpf_insn good_prog_insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	const size_t good_prog_insn_cnt = sizeof(good_prog_insns) / sizeof(struct bpf_insn);
+	const struct bpf_insn bad_prog_insns[] = {
+		BPF_EXIT_INSN(),
+	};
+	size_t bad_prog_insn_cnt = sizeof(bad_prog_insns) / sizeof(struct bpf_insn);
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
+	const size_t log_buf_sz = 1024 * 1024;
+	char *log_buf;
+	int fd = -1;
+
+	log_buf = malloc(log_buf_sz);
+	if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc"))
+		return;
+	opts.log_buf = log_buf;
+	opts.log_size = log_buf_sz;
+
+	/* with log_level == 0 log_buf shoud stay empty for good prog */
+	log_buf[0] = '\0';
+	opts.log_level = 0;
+	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL",
+			   good_prog_insns, good_prog_insn_cnt, &opts);
+	ASSERT_STREQ(log_buf, "", "good_log_0");
+	ASSERT_GE(fd, 0, "good_fd1");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+	/* log_level == 2 should always fill log_buf, even for good prog */
+	log_buf[0] = '\0';
+	opts.log_level = 2;
+	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL",
+			   good_prog_insns, good_prog_insn_cnt, &opts);
+	ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx(id=0,off=0,imm=0) R10=fp0"), "good_log_2");
+	ASSERT_GE(fd, 0, "good_fd2");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+	/* log_level == 0 should fill log_buf for bad prog */
+	log_buf[0] = '\0';
+	opts.log_level = 0;
+	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "bad_prog", "GPL",
+			   bad_prog_insns, bad_prog_insn_cnt, &opts);
+	ASSERT_OK_PTR(strstr(log_buf, "R0 !read_ok"), "bad_log_0");
+	ASSERT_LT(fd, 0, "bad_fd");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+	free(log_buf);
+}
+
+static void bpf_btf_load_log_buf(void)
+{
+	LIBBPF_OPTS(bpf_btf_load_opts, opts);
+	const size_t log_buf_sz = 1024 * 1024;
+	const void *raw_btf_data;
+	__u32 raw_btf_size;
+	struct btf *btf;
+	char *log_buf;
+	int fd = -1;
+
+	btf = btf__new_empty();
+	if (!ASSERT_OK_PTR(btf, "empty_btf"))
+		return;
+
+	ASSERT_GT(btf__add_int(btf, "int", 4, 0), 0, "int_type");
+
+	raw_btf_data = btf__raw_data(btf, &raw_btf_size);
+	if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data_good"))
+		goto cleanup;
+
+	log_buf = malloc(log_buf_sz);
+	if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc"))
+		goto cleanup;
+	opts.log_buf = log_buf;
+	opts.log_size = log_buf_sz;
+
+	/* with log_level == 0 log_buf shoud stay empty for good BTF */
+	log_buf[0] = '\0';
+	opts.log_level = 0;
+	fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts);
+	ASSERT_STREQ(log_buf, "", "good_log_0");
+	ASSERT_GE(fd, 0, "good_fd1");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+	/* log_level == 2 should always fill log_buf, even for good BTF */
+	log_buf[0] = '\0';
+	opts.log_level = 2;
+	fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts);
+	printf("LOG_BUF: %s\n", log_buf);
+	ASSERT_OK_PTR(strstr(log_buf, "magic: 0xeb9f"), "good_log_2");
+	ASSERT_GE(fd, 0, "good_fd2");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+	/* make BTF bad, add pointer pointing to non-existing type */
+	ASSERT_GT(btf__add_ptr(btf, 100), 0, "bad_ptr_type");
+
+	raw_btf_data = btf__raw_data(btf, &raw_btf_size);
+	if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data_bad"))
+		goto cleanup;
+
+	/* log_level == 0 should fill log_buf for bad BTF */
+	log_buf[0] = '\0';
+	opts.log_level = 0;
+	fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts);
+	printf("LOG_BUF: %s\n", log_buf);
+	ASSERT_OK_PTR(strstr(log_buf, "[2] PTR (anon) type_id=100 Invalid type_id"), "bad_log_0");
+	ASSERT_LT(fd, 0, "bad_fd");
+	if (fd >= 0)
+		close(fd);
+	fd = -1;
+
+cleanup:
+	free(log_buf);
+	btf__free(btf);
+}
+
+void test_log_buf(void)
+{
+	if (test__start_subtest("obj_load_log_buf"))
+		obj_load_log_buf();
+	if (test__start_subtest("bpf_prog_load_log_buf"))
+		bpf_prog_load_log_buf();
+	if (test__start_subtest("bpf_btf_load_log_buf"))
+		bpf_btf_load_log_buf();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_log_buf.c b/tools/testing/selftests/bpf/progs/test_log_buf.c
new file mode 100644
index 000000000000..199f459bd5ae
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_log_buf.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int a[4];
+const volatile int off = 4000;
+
+SEC("raw_tp/sys_enter")
+int good_prog(const void *ctx)
+{
+	a[0] = (int)(long)ctx;
+	return a[1];
+}
+
+SEC("raw_tp/sys_enter")
+int bad_prog(const void *ctx)
+{
+	/* out of bounds access */
+	return a[off];
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 3fc5fdcca144badbaf29b62aacbf7877f2f39a74 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:39 -0800
Subject: selftests/bpf: Remove the only use of deprecated
 bpf_object__load_xattr()

Switch from bpf_object__load_xattr() to bpf_object__load() and
kernel_log_level in bpf_object_open_opts.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-12-andrii@kernel.org
---
 tools/testing/selftests/bpf/testing_helpers.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 0f1c37ac6f2c..795b6798ccee 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -88,13 +88,15 @@ int extra_prog_load_log_flags = 0;
 int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 		       struct bpf_object **pobj, int *prog_fd)
 {
-	struct bpf_object_load_attr attr = {};
+	LIBBPF_OPTS(bpf_object_open_opts, opts,
+		.kernel_log_level = extra_prog_load_log_flags,
+	);
 	struct bpf_object *obj;
 	struct bpf_program *prog;
 	__u32 flags;
 	int err;
 
-	obj = bpf_object__open(file);
+	obj = bpf_object__open_file(file, &opts);
 	if (!obj)
 		return -errno;
 
@@ -110,9 +112,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32;
 	bpf_program__set_flags(prog, flags);
 
-	attr.obj = obj;
-	attr.log_level = extra_prog_load_log_flags;
-	err = bpf_object__load_xattr(&attr);
+	err = bpf_object__load(obj);
 	if (err)
 		goto err_out;
 
-- 
cgit v1.2.3


From b59e4ce8bcaab6445f4a0d37a96ca8953caaf5cf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Dec 2021 11:38:40 -0800
Subject: bpftool: Switch bpf_object__load_xattr() to bpf_object__load()

Switch all the uses of to-be-deprecated bpf_object__load_xattr() into
a simple bpf_object__load() calls with optional log_level passed through
open_opts.kernel_log_level, if -d option is specified.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211209193840.1248570-13-andrii@kernel.org
---
 tools/bpf/bpftool/gen.c        | 11 ++++-------
 tools/bpf/bpftool/prog.c       | 24 ++++++++++--------------
 tools/bpf/bpftool/struct_ops.c | 15 +++++++--------
 3 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 997a2865e04a..b4695df2ea3d 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -486,7 +486,6 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name)
 
 static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard)
 {
-	struct bpf_object_load_attr load_attr = {};
 	DECLARE_LIBBPF_OPTS(gen_loader_opts, opts);
 	struct bpf_map *map;
 	char ident[256];
@@ -496,12 +495,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
 	if (err)
 		return err;
 
-	load_attr.obj = obj;
-	if (verifier_logs)
-		/* log_level1 + log_level2 + stats, but not stable UAPI */
-		load_attr.log_level = 1 + 2 + 4;
-
-	err = bpf_object__load_xattr(&load_attr);
+	err = bpf_object__load(obj);
 	if (err) {
 		p_err("failed to load object file");
 		goto out;
@@ -719,6 +713,9 @@ static int do_skeleton(int argc, char **argv)
 	if (obj_name[0] == '\0')
 		get_obj_name(obj_name, file);
 	opts.object_name = obj_name;
+	if (verifier_logs)
+		/* log_level1 + log_level2 + stats, but not stable UAPI */
+		opts.kernel_log_level = 1 + 2 + 4;
 	obj = bpf_object__open_mem(obj_data, file_sz, &opts);
 	err = libbpf_get_error(obj);
 	if (err) {
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 45ccc254e69f..f874896c4154 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1464,7 +1464,6 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
 		.relaxed_maps = relaxed_maps,
 	);
-	struct bpf_object_load_attr load_attr = { 0 };
 	enum bpf_attach_type expected_attach_type;
 	struct map_replace *map_replace = NULL;
 	struct bpf_program *prog = NULL, *pos;
@@ -1598,6 +1597,10 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 
 	set_max_rlimit();
 
+	if (verifier_logs)
+		/* log_level1 + log_level2 + stats, but not stable UAPI */
+		open_opts.kernel_log_level = 1 + 2 + 4;
+
 	obj = bpf_object__open_file(file, &open_opts);
 	if (libbpf_get_error(obj)) {
 		p_err("failed to open object file");
@@ -1677,12 +1680,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 		goto err_close_obj;
 	}
 
-	load_attr.obj = obj;
-	if (verifier_logs)
-		/* log_level1 + log_level2 + stats, but not stable UAPI */
-		load_attr.log_level = 1 + 2 + 4;
-
-	err = bpf_object__load_xattr(&load_attr);
+	err = bpf_object__load(obj);
 	if (err) {
 		p_err("failed to load object file");
 		goto err_close_obj;
@@ -1809,7 +1807,6 @@ static int do_loader(int argc, char **argv)
 {
 	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts);
 	DECLARE_LIBBPF_OPTS(gen_loader_opts, gen);
-	struct bpf_object_load_attr load_attr = {};
 	struct bpf_object *obj;
 	const char *file;
 	int err = 0;
@@ -1818,6 +1815,10 @@ static int do_loader(int argc, char **argv)
 		return -1;
 	file = GET_ARG();
 
+	if (verifier_logs)
+		/* log_level1 + log_level2 + stats, but not stable UAPI */
+		open_opts.kernel_log_level = 1 + 2 + 4;
+
 	obj = bpf_object__open_file(file, &open_opts);
 	if (libbpf_get_error(obj)) {
 		p_err("failed to open object file");
@@ -1828,12 +1829,7 @@ static int do_loader(int argc, char **argv)
 	if (err)
 		goto err_close_obj;
 
-	load_attr.obj = obj;
-	if (verifier_logs)
-		/* log_level1 + log_level2 + stats, but not stable UAPI */
-		load_attr.log_level = 1 + 2 + 4;
-
-	err = bpf_object__load_xattr(&load_attr);
+	err = bpf_object__load(obj);
 	if (err) {
 		p_err("failed to load object file");
 		goto err_close_obj;
diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c
index cbdca37a53f0..2f693b082bdb 100644
--- a/tools/bpf/bpftool/struct_ops.c
+++ b/tools/bpf/bpftool/struct_ops.c
@@ -479,7 +479,7 @@ static int do_unregister(int argc, char **argv)
 
 static int do_register(int argc, char **argv)
 {
-	struct bpf_object_load_attr load_attr = {};
+	LIBBPF_OPTS(bpf_object_open_opts, open_opts);
 	const struct bpf_map_def *def;
 	struct bpf_map_info info = {};
 	__u32 info_len = sizeof(info);
@@ -494,18 +494,17 @@ static int do_register(int argc, char **argv)
 
 	file = GET_ARG();
 
-	obj = bpf_object__open(file);
+	if (verifier_logs)
+		/* log_level1 + log_level2 + stats, but not stable UAPI */
+		open_opts.kernel_log_level = 1 + 2 + 4;
+
+	obj = bpf_object__open_file(file, &open_opts);
 	if (libbpf_get_error(obj))
 		return -1;
 
 	set_max_rlimit();
 
-	load_attr.obj = obj;
-	if (verifier_logs)
-		/* log_level1 + log_level2 + stats, but not stable UAPI */
-		load_attr.log_level = 1 + 2 + 4;
-
-	if (bpf_object__load_xattr(&load_attr)) {
+	if (bpf_object__load(obj)) {
 		bpf_object__close(obj);
 		return -1;
 	}
-- 
cgit v1.2.3


From b69c5c07a66ee569b8ccdc0cb567fe0622c89ea5 Mon Sep 17 00:00:00 2001
From: Vincent Minet <vincent@vincent-minet.net>
Date: Fri, 10 Dec 2021 07:31:12 +0100
Subject: libbpf: Fix typo in btf__dedup@LIBBPF_0.0.2 definition

The btf__dedup_deprecated name was misspelled in the definition of the
compat symbol for btf__dedup. This leads it to be missing from the
shared library.

This fixes it.

Fixes: 957d350a8b94 ("libbpf: Turn btf_dedup_opts into OPTS-based struct")
Signed-off-by: Vincent Minet <vincent@vincent-minet.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211210063112.80047-1-vincent@vincent-minet.net
---
 tools/lib/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index e171424192ae..9aa19c89f758 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -3107,7 +3107,7 @@ done:
 	return libbpf_err(err);
 }
 
-COMPAT_VERSION(bpf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2)
+COMPAT_VERSION(btf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2)
 int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts)
 {
 	LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext);
-- 
cgit v1.2.3


From 229fae38d0fc0d6ff58d57cbeb1432da55e58d4f Mon Sep 17 00:00:00 2001
From: Shuyi Cheng <chengshuyi@linux.alibaba.com>
Date: Fri, 10 Dec 2021 17:39:57 +0800
Subject: libbpf: Add "bool skipped" to struct bpf_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix error: "failed to pin map: Bad file descriptor, path:
/sys/fs/bpf/_rodata_str1_1."

In the old kernel, the global data map will not be created, see [0]. So
we should skip the pinning of the global data map to avoid
bpf_object__pin_maps returning error. Therefore, when the map is not
created, we mark “map->skipped" as true and then check during relocation
and during pinning.

Fixes: 16e0c35c6f7a ("libbpf: Load global data maps lazily on legacy kernels")
Signed-off-by: Shuyi Cheng <chengshuyi@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/lib/bpf/libbpf.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 18d95c6a89fe..d027e1d620fc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -431,6 +431,7 @@ struct bpf_map {
 	char *pin_path;
 	bool pinned;
 	bool reused;
+	bool skipped;
 	__u64 map_extra;
 };
 
@@ -5087,8 +5088,10 @@ bpf_object__create_maps(struct bpf_object *obj)
 		 * kernels.
 		 */
 		if (bpf_map__is_internal(map) &&
-		    !kernel_supports(obj, FEAT_GLOBAL_DATA))
+		    !kernel_supports(obj, FEAT_GLOBAL_DATA)) {
+			map->skipped = true;
 			continue;
+		}
 
 		retried = false;
 retry:
@@ -5717,8 +5720,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 			} else {
 				const struct bpf_map *map = &obj->maps[relo->map_idx];
 
-				if (bpf_map__is_internal(map) &&
-				    !kernel_supports(obj, FEAT_GLOBAL_DATA)) {
+				if (map->skipped) {
 					pr_warn("prog '%s': relo #%d: kernel doesn't support global data\n",
 						prog->name, i);
 					return -ENOTSUP;
@@ -7926,6 +7928,9 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 		char *pin_path = NULL;
 		char buf[PATH_MAX];
 
+		if (map->skipped)
+			continue;
+
 		if (path) {
 			int len;
 
-- 
cgit v1.2.3


From 7b0653eca4cf7d57de84379bd81441d574901ddb Mon Sep 17 00:00:00 2001
From: Zhang Mingyu <zhang.mingyu@zte.com.cn>
Date: Fri, 5 Nov 2021 01:35:47 +0000
Subject: kselftests: timers:Remove unneeded semicolon

Eliminate the following coccinelle check warning:
tools/testing/selftests/timers/alarmtimer-suspend.c:82:2-3

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Zhang Mingyu <zhang.mingyu@zte.com.cn>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index 4da09dbf83ba..54da4b088f4c 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -79,7 +79,7 @@ char *clockstring(int clockid)
 		return "CLOCK_BOOTTIME_ALARM";
 	case CLOCK_TAI:
 		return "CLOCK_TAI";
-	};
+	}
 	return "UNKNOWN_CLOCKID";
 }
 
-- 
cgit v1.2.3


From 7ace3e9ae0492a6b33f5af8b8e1a6f1e0390cc15 Mon Sep 17 00:00:00 2001
From: Zhang Mingyu <zhang.mingyu@zte.com.cn>
Date: Fri, 5 Nov 2021 01:42:08 +0000
Subject: selftests: timers: Remove unneeded semicolon

Eliminate the following coccinelle check warning:
tools/testing/selftests/timers/inconsistency-check.c:75:2-3

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Zhang Mingyu <zhang.mingyu@zte.com.cn>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timers/inconsistency-check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c
index 022d3ffe3fbf..e6756d9c60a7 100644
--- a/tools/testing/selftests/timers/inconsistency-check.c
+++ b/tools/testing/selftests/timers/inconsistency-check.c
@@ -72,7 +72,7 @@ char *clockstring(int clockid)
 		return "CLOCK_BOOTTIME_ALARM";
 	case CLOCK_TAI:
 		return "CLOCK_TAI";
-	};
+	}
 	return "UNKNOWN_CLOCKID";
 }
 
-- 
cgit v1.2.3


From a531b0c23c0fc68ad758cc31a74cf612a4dafeb0 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Wed, 3 Nov 2021 21:13:50 +0100
Subject: selftests: clone3: clone3: add case CLONE3_ARGS_NO_TEST

Building selftests/clone3 with clang warns about enumeration not handled
in switch case:

clone3.c:54:10: warning: enumeration value 'CLONE3_ARGS_NO_TEST' not handled in switch [-Wswitch]
        switch (test_mode) {
                ^

Add the missing switch case with a comment.

Fixes: 17a810699c18 ("selftests: add tests for clone3()")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/clone3/clone3.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
index 42be3b925830..076cf4325f78 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -52,6 +52,12 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 		size = sizeof(struct __clone_args);
 
 	switch (test_mode) {
+	case CLONE3_ARGS_NO_TEST:
+		/*
+		 * Uses default 'flags' and 'SIGCHLD'
+		 * assignment.
+		 */
+		break;
 	case CLONE3_ARGS_ALL_0:
 		args.flags = 0;
 		args.exit_signal = 0;
-- 
cgit v1.2.3


From e5992f373c6eed6d09e5858e9623df1259b3ce30 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 30 Nov 2021 13:25:46 +0100
Subject: selftests/ftrace: make kprobe profile testcase description unique

Commit 32f6e5da83c7 ("selftests/ftrace: Add kprobe profile testcase")
added a new kprobes testcase, but has a description which does not
describe what the test case is doing and is duplicating the description
of another test case.

Therefore change the test case description, so it is unique and then
allows easily to tell which test case actually passed or failed.

Reported-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ftrace/test.d/kprobe/profile.tc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc
index 98166fa3eb91..34fb89b0c61f 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc
@@ -1,6 +1,6 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
-# description: Kprobe dynamic event - adding and removing
+# description: Kprobe profile
 # requires: kprobe_events
 
 ! grep -q 'myevent' kprobe_profile
-- 
cgit v1.2.3


From 3abedf4646fdc0036fcb8ebbc3b600667167fafe Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Nov 2021 14:39:16 -0800
Subject: selftests: harness: avoid false negatives if test has no ASSERTs

Test can fail either immediately when ASSERT() failed or at the
end if one or more EXPECT() was not met. The exact return code
is decided based on the number of successful ASSERT()s.

If test has no ASSERT()s, however, the return code will be 0,
as if the test did not fail. Start counting ASSERT()s from 1.

Fixes: 369130b63178 ("selftests: Enhance kselftest_harness.h to print which assert failed")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/kselftest_harness.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index ae0f0f33b2a6..79a182cfa43a 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -969,7 +969,7 @@ void __run_test(struct __fixture_metadata *f,
 	t->passed = 1;
 	t->skip = 0;
 	t->trigger = 0;
-	t->step = 0;
+	t->step = 1;
 	t->no_print = 0;
 	memset(t->results->reason, 0, sizeof(t->results->reason));
 
-- 
cgit v1.2.3


From 6d425d7c1bec67eceee42fefe5ce00be0645482e Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Thu, 25 Nov 2021 08:56:03 +0800
Subject: selftests/mount: remove unneeded conversion to bool

The coccinelle report
./tools/testing/selftests/mount/unprivileged-remount-test.c:285:54-59:
WARNING: conversion to bool not needed here
./tools/testing/selftests/mount/unprivileged-remount-test.c:207:54-59:
WARNING: conversion to bool not needed here
Relational and logical operators evaluate to bool,
explicit conversion is overly verbose and unneeded.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/mount/unprivileged-remount-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
index 584dc6bc3b06..d2917054fe3a 100644
--- a/tools/testing/selftests/mount/unprivileged-remount-test.c
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -204,7 +204,7 @@ bool test_unpriv_remount(const char *fstype, const char *mount_options,
 		if (!WIFEXITED(status)) {
 			die("child did not terminate cleanly\n");
 		}
-		return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+		return WEXITSTATUS(status) == EXIT_SUCCESS;
 	}
 
 	create_and_enter_userns();
@@ -282,7 +282,7 @@ static bool test_priv_mount_unpriv_remount(void)
 		if (!WIFEXITED(status)) {
 			die("child did not terminate cleanly\n");
 		}
-		return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+		return WEXITSTATUS(status) == EXIT_SUCCESS;
 	}
 
 	orig_mnt_flags = read_mnt_flags(orig_path);
-- 
cgit v1.2.3


From 009482c0932ae2420277dc0adaefa5bd51cb433e Mon Sep 17 00:00:00 2001
From: Yang Guang <yang.guang5@zte.com.cn>
Date: Mon, 6 Dec 2021 08:07:23 +0800
Subject: selftests/move_mount_set_group remove unneeded conversion to bool

The coccinelle report
./tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c:225:18-23:
WARNING: conversion to bool not needed here
Relational and logical operators evaluate to bool,
explicit conversion is overly verbose and unneeded.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Yang Guang <yang.guang5@zte.com.cn>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .../selftests/move_mount_set_group/move_mount_set_group_test.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
index 860198f83a53..50ed5d475dd1 100644
--- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
+++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
@@ -191,7 +191,7 @@ static bool is_shared_mount(const char *path)
 #define SET_GROUP_FROM	"/tmp/move_mount_set_group_supported_from"
 #define SET_GROUP_TO	"/tmp/move_mount_set_group_supported_to"
 
-static int move_mount_set_group_supported(void)
+static bool move_mount_set_group_supported(void)
 {
 	int ret;
 
@@ -222,7 +222,7 @@ static int move_mount_set_group_supported(void)
 		      AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP);
 	umount2("/tmp", MNT_DETACH);
 
-	return ret < 0 ? false : true;
+	return ret >= 0;
 }
 
 FIXTURE(move_mount_set_group) {
@@ -232,7 +232,7 @@ FIXTURE(move_mount_set_group) {
 
 FIXTURE_SETUP(move_mount_set_group)
 {
-	int ret;
+	bool ret;
 
 	ASSERT_EQ(prepare_unpriv_mountns(), 0);
 
@@ -254,7 +254,7 @@ FIXTURE_SETUP(move_mount_set_group)
 
 FIXTURE_TEARDOWN(move_mount_set_group)
 {
-	int ret;
+	bool ret;
 
 	ret = move_mount_set_group_supported();
 	ASSERT_GE(ret, 0);
@@ -348,7 +348,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying)
 		.shared = false,
 	};
 	pid_t pid;
-	int ret;
+	bool ret;
 
 	ret = move_mount_set_group_supported();
 	ASSERT_GE(ret, 0);
-- 
cgit v1.2.3


From a738a4ce8421b2187fa5b74bb475eeeb6ec9ac14 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Fri, 5 Nov 2021 17:25:29 +0100
Subject: selftests: cgroup: build error multiple outpt files

When building selftests/cgroup: with clang the following error are seen:

clang -Wall -pthread    test_memcontrol.c cgroup_util.c ../clone3/clone3_selftests.h  -o .../builds/current/kselftest/cgroup/test_memcontrol
clang: error: cannot specify -o when generating multiple output files
make[3]: *** [../lib.mk:146: .../builds/current/kselftest/cgroup/test_memcontrol] Error 1

Rework to add the header files to LOCAL_HDRS before including ../lib.mk,
since the dependency is evaluated in '$(OUTPUT)/%:%.c $(LOCAL_HDRS)' in
file lib.mk.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/cgroup/Makefile | 12 +++++++-----
 tools/testing/selftests/lib.mk          |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 59e222460581..745fe25fa0b9 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -11,10 +11,12 @@ TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
 TEST_GEN_PROGS += test_kill
 
+LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
+
 include ../lib.mk
 
-$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h
+$(OUTPUT)/test_memcontrol: cgroup_util.c
+$(OUTPUT)/test_kmem: cgroup_util.c
+$(OUTPUT)/test_core: cgroup_util.c
+$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_kill: cgroup_util.c
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index fe7ee2b0f29c..a40add31a2e3 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -141,7 +141,7 @@ endif
 # Selftest makefiles can override those targets by setting
 # OVERRIDE_TARGETS = 1.
 ifeq ($(OVERRIDE_TARGETS),)
-LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h
+LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h
 $(OUTPUT)/%:%.c $(LOCAL_HDRS)
 	$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@
 
-- 
cgit v1.2.3


From 066b34aa5461f6072dbbecb690f4fe446b736ebf Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 8 Dec 2021 10:47:42 -0700
Subject: tools: fix ARRAY_SIZE defines in tools and selftests hdrs

tools/include/linux/kernel.h and kselftest_harness.h are missing
ifndef guard around ARRAY_SIZE define. Fix them to avoid duplicate
define errors during compile when another file defines it. This
problem was found when compiling selftests that include a header
with ARRAY_SIZE define.

ARRAY_SIZE is defined in several selftests. There are about 25+
duplicate defines in various selftests source and header files.
Add ARRAY_SIZE to kselftest.h in preparation for removing duplicate
ARRAY_SIZE defines from individual test files.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/include/linux/kernel.h                | 2 ++
 tools/testing/selftests/kselftest.h         | 4 ++++
 tools/testing/selftests/kselftest_harness.h | 2 ++
 3 files changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 3e8df500cfbd..9701e8307db0 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -92,7 +92,9 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
 int scnprintf(char * buf, size_t size, const char * fmt, ...);
 int scnprintf_pad(char * buf, size_t size, const char * fmt, ...);
 
+#ifndef ARRAY_SIZE
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+#endif
 
 #define current_gfp_context(k) 0
 #define synchronize_rcu()
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index 8d50483fe204..f1180987492c 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -48,6 +48,10 @@
 #include <stdarg.h>
 #include <stdio.h>
 
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
 /* define kselftest exit codes */
 #define KSFT_PASS  0
 #define KSFT_FAIL  1
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 79a182cfa43a..471eaa7b3a3f 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -671,7 +671,9 @@
 #define EXPECT_STRNE(expected, seen) \
 	__EXPECT_STR(expected, seen, !=, 0)
 
+#ifndef ARRAY_SIZE
 #define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
+#endif
 
 /* Support an optional handler after and ASSERT_* or EXPECT_*.  The approach is
  * not thread-safe, but it should be fine in most sane test scenarios.
-- 
cgit v1.2.3


From 2684618b61189544aa4a539fb5df9eb074de8ce6 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 12:36:24 -0700
Subject: selftests/arm64: remove ARRAY_SIZE define from vec-syscfg.c

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from vec-syscfg.c and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/arm64/fp/vec-syscfg.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c
index 272b888e018e..c90658811a83 100644
--- a/tools/testing/selftests/arm64/fp/vec-syscfg.c
+++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c
@@ -21,8 +21,6 @@
 #include "../../kselftest.h"
 #include "rdvl.h"
 
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
-
 #define ARCH_MIN_VL SVE_VL_MIN
 
 struct vec_data {
-- 
cgit v1.2.3


From 72a571d1e25f732ae7d50b9566ad68ce87c733d9 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 12:49:13 -0700
Subject: selftests/cgroup: remove ARRAY_SIZE define from cgroup_util.h

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from cgroup_util.h and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/cgroup/cgroup_util.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h
index 82e59cdf16e7..4f66d10626d2 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -2,9 +2,9 @@
 #include <stdbool.h>
 #include <stdlib.h>
 
-#define PAGE_SIZE 4096
+#include "../kselftest.h"
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define PAGE_SIZE 4096
 
 #define MB(x) (x << 20)
 
-- 
cgit v1.2.3


From fc1d330358423f5f812b3c647bf23e25cae5ab15 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 12:57:14 -0700
Subject: selftests/core: remove ARRAY_SIZE define from close_range_test.c

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from close_range_test.c and pickup the one defined
in kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/core/close_range_test.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index aa7d13d91963..749239930ca8 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -50,10 +50,6 @@ static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
 	return syscall(__NR_close_range, fd, max_fd, flags);
 }
 
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 TEST(core_close_range)
 {
 	int i, ret;
-- 
cgit v1.2.3


From 8eda7963235d0c2d6f926dea951f071bf9aa730f Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 13:58:49 -0700
Subject: selftests/ir: remove ARRAY_SIZE define from ir_loopback.c

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from ir_loopback.c and pickup the one defined
in kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ir/ir_loopback.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c
index af7f9c7d59bc..06256c96df12 100644
--- a/tools/testing/selftests/ir/ir_loopback.c
+++ b/tools/testing/selftests/ir/ir_loopback.c
@@ -26,7 +26,6 @@
 #include "../kselftest.h"
 
 #define TEST_SCANCODES	10
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #define SYSFS_PATH_MAX 256
 #define DNAME_PATH_MAX 256
 
-- 
cgit v1.2.3


From 5a69d33b3ed62f6c1eb7ad3b9c350d29f49513e8 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 14:06:08 -0700
Subject: selftests/landlock: remove ARRAY_SIZE define from common.h

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from common.h and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/landlock/common.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 20e2a9286d71..183b7e8e1b95 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -17,10 +17,6 @@
 
 #include "../kselftest_harness.h"
 
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 /*
  * TEST_F_FORK() is useful when a test drop privileges but the corresponding
  * FIXTURE_TEARDOWN() requires them (e.g. to remove files from a directory
-- 
cgit v1.2.3


From 1329e40ebd18f1171f89f9e61b7a4aed9a1de406 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 14:10:23 -0700
Subject: selftests/net: remove ARRAY_SIZE define from individual tests

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from net tests and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/net/gro.c                     | 3 ++-
 tools/testing/selftests/net/ipsec.c                   | 1 -
 tools/testing/selftests/net/reuseport_bpf.c           | 4 +---
 tools/testing/selftests/net/rxtimestamp.c             | 2 +-
 tools/testing/selftests/net/socket.c                  | 3 ++-
 tools/testing/selftests/net/tcp_fastopen_backup_key.c | 6 ++----
 6 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index cf37ce86b0fd..221525ccbe1d 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -57,10 +57,11 @@
 #include <string.h>
 #include <unistd.h>
 
+#include "../kselftest.h"
+
 #define DPORT 8000
 #define SPORT 1500
 #define PAYLOAD_LEN 100
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #define NUM_PACKETS 4
 #define START_SEQ 100
 #define START_ACK 100
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index 3d7dde2c321b..cc10c10c5ed9 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -41,7 +41,6 @@
 
 #define pr_err(fmt, ...)	printk(fmt ": %m", ##__VA_ARGS__)
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
 #define IPV4_STR_SZ	16	/* xxx.xxx.xxx.xxx is longest + \0 */
diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c
index b5277106df1f..072d709c96b4 100644
--- a/tools/testing/selftests/net/reuseport_bpf.c
+++ b/tools/testing/selftests/net/reuseport_bpf.c
@@ -24,9 +24,7 @@
 #include <sys/resource.h>
 #include <unistd.h>
 
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-#endif
+#include "../kselftest.h"
 
 struct test_params {
 	int recv_family;
diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c
index e4613ce4ed69..9eb42570294d 100644
--- a/tools/testing/selftests/net/rxtimestamp.c
+++ b/tools/testing/selftests/net/rxtimestamp.c
@@ -18,7 +18,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/errqueue.h>
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#include "../kselftest.h"
 
 struct options {
 	int so_timestamp;
diff --git a/tools/testing/selftests/net/socket.c b/tools/testing/selftests/net/socket.c
index afca1ead677f..db1aeb8c5d1e 100644
--- a/tools/testing/selftests/net/socket.c
+++ b/tools/testing/selftests/net/socket.c
@@ -7,6 +7,8 @@
 #include <sys/socket.h>
 #include <netinet/in.h>
 
+#include "../kselftest.h"
+
 struct socket_testcase {
 	int	domain;
 	int	type;
@@ -31,7 +33,6 @@ static struct socket_testcase tests[] = {
 	{ AF_INET, SOCK_STREAM, IPPROTO_UDP, -EPROTONOSUPPORT, 1  },
 };
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #define ERR_STRING_SZ	64
 
 static int run_tests(void)
diff --git a/tools/testing/selftests/net/tcp_fastopen_backup_key.c b/tools/testing/selftests/net/tcp_fastopen_backup_key.c
index 9c55ec44fc43..c1cb0c75156a 100644
--- a/tools/testing/selftests/net/tcp_fastopen_backup_key.c
+++ b/tools/testing/selftests/net/tcp_fastopen_backup_key.c
@@ -26,6 +26,8 @@
 #include <fcntl.h>
 #include <time.h>
 
+#include "../kselftest.h"
+
 #ifndef TCP_FASTOPEN_KEY
 #define TCP_FASTOPEN_KEY 33
 #endif
@@ -34,10 +36,6 @@
 #define PROC_FASTOPEN_KEY "/proc/sys/net/ipv4/tcp_fastopen_key"
 #define KEY_LENGTH 16
 
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-#endif
-
 static bool do_ipv6;
 static bool do_sockopt;
 static bool do_rotate;
-- 
cgit v1.2.3


From 07ad4f7629d4802ff0d962b0ac23ea6445964e2a Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 16:19:09 -0700
Subject: selftests/rseq: remove ARRAY_SIZE define from individual tests

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from rseq tests and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/rseq/basic_percpu_ops_test.c | 3 +--
 tools/testing/selftests/rseq/rseq.c                  | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
index eb3f6db36d36..b953a52ff706 100644
--- a/tools/testing/selftests/rseq/basic_percpu_ops_test.c
+++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
@@ -9,10 +9,9 @@
 #include <string.h>
 #include <stddef.h>
 
+#include "../kselftest.h"
 #include "rseq.h"
 
-#define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof((arr)[0]))
-
 struct percpu_lock_entry {
 	intptr_t v;
 } __attribute__((aligned(128)));
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 7159eb777fd3..fb440dfca158 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -27,10 +27,9 @@
 #include <signal.h>
 #include <limits.h>
 
+#include "../kselftest.h"
 #include "rseq.h"
 
-#define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof((arr)[0]))
-
 __thread volatile struct rseq __rseq_abi = {
 	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
 };
-- 
cgit v1.2.3


From 6e5eba2e336694b88e3ef6584a25e0723c173ae2 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 16:20:53 -0700
Subject: selftests/seccomp: remove ARRAY_SIZE define from seccomp_benchmark

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from seccomp_benchmark and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/seccomp/seccomp_benchmark.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
index 6e5102a7d7c9..5b5c9d558dee 100644
--- a/tools/testing/selftests/seccomp/seccomp_benchmark.c
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -18,7 +18,7 @@
 #include <sys/syscall.h>
 #include <sys/types.h>
 
-#define ARRAY_SIZE(a)    (sizeof(a) / sizeof(a[0]))
+#include "../kselftest.h"
 
 unsigned long long timing(clockid_t clk_id, unsigned long long samples)
 {
-- 
cgit v1.2.3


From 08ca3510f74839a695257392a58a6d7ec142eaed Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 16:22:39 -0700
Subject: selftests/sparc64: remove ARRAY_SIZE define from adi-test

ARRAY_SIZE is defined in several selftests. Remove definition from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from adi-test and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/sparc64/drivers/adi-test.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sparc64/drivers/adi-test.c b/tools/testing/selftests/sparc64/drivers/adi-test.c
index 95d93c6a88a5..84e5d9fd20b0 100644
--- a/tools/testing/selftests/sparc64/drivers/adi-test.c
+++ b/tools/testing/selftests/sparc64/drivers/adi-test.c
@@ -24,10 +24,6 @@
 #define DEBUG_LEVEL_4_BIT	(0x0008)
 #define DEBUG_TIMING_BIT	(0x1000)
 
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 /* bit mask of enabled bits to print */
 #define DEBUG 0x0001
 
-- 
cgit v1.2.3


From 7527c03870fd7bfb21b13656c8b5dda30b6685a7 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 16:27:45 -0700
Subject: selftests/timens: remove ARRAY_SIZE define from individual tests

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from timens tests and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timens/procfs.c | 2 --
 tools/testing/selftests/timens/timens.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c
index f2519154208a..1833ca97eb24 100644
--- a/tools/testing/selftests/timens/procfs.c
+++ b/tools/testing/selftests/timens/procfs.c
@@ -24,8 +24,6 @@
 #define DAY_IN_SEC			(60*60*24)
 #define TEN_DAYS_IN_SEC			(10*DAY_IN_SEC)
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 static int child_ns, parent_ns;
 
 static int switch_ns(int fd)
diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c
index 52b6a1185f52..387220791a05 100644
--- a/tools/testing/selftests/timens/timens.c
+++ b/tools/testing/selftests/timens/timens.c
@@ -22,8 +22,6 @@
 #define DAY_IN_SEC			(60*60*24)
 #define TEN_DAYS_IN_SEC			(10*DAY_IN_SEC)
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 struct test_clock {
 	clockid_t id;
 	char *name;
-- 
cgit v1.2.3


From e89908201e2509354c40158b517945bf3d645812 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 9 Dec 2021 16:35:02 -0700
Subject: selftests/vm: remove ARRAY_SIZE define from individual tests

ARRAY_SIZE is defined in several selftests. Remove definitions from
individual test files and include header file for the define instead.
ARRAY_SIZE define is added in a separate patch to prepare for this
change.

Remove ARRAY_SIZE from vm tests and pickup the one defined in
kselftest.h.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/vm/mremap_test.c    | 1 -
 tools/testing/selftests/vm/pkey-helpers.h   | 3 ++-
 tools/testing/selftests/vm/va_128TBswitch.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index 0624d1bd71b5..7c0b0617b9f8 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -20,7 +20,6 @@
 #define VALIDATION_DEFAULT_THRESHOLD 4	/* 4MB */
 #define VALIDATION_NO_THRESHOLD 0	/* Verify the entire region */
 
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
 
 struct config {
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
index 622a85848f61..92f3be3dd8e5 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -13,6 +13,8 @@
 #include <ucontext.h>
 #include <sys/mman.h>
 
+#include "../kselftest.h"
+
 /* Define some kernel-like types */
 #define  u8 __u8
 #define u16 __u16
@@ -175,7 +177,6 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write)
 	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
 }
 
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 #define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
 #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
 #define ALIGN_PTR_UP(p, ptr_align_to)	\
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
index 83acdff26a13..da6ec3b53ea8 100644
--- a/tools/testing/selftests/vm/va_128TBswitch.c
+++ b/tools/testing/selftests/vm/va_128TBswitch.c
@@ -9,7 +9,7 @@
 #include <sys/mman.h>
 #include <string.h>
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#include "../kselftest.h"
 
 #ifdef __powerpc64__
 #define PAGE_SIZE	(64 << 10)
-- 
cgit v1.2.3


From 964e17016cf99902c79a5de095cc5e57e7d58248 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:46 -0800
Subject: selftests/damon: skip test if DAMON is running

Testing the DAMON debugfs files while DAMON is running makes no sense,
as any write to the debugfs files will fail.  This commit makes the test
be skipped in this case.

Link: https://lkml.kernel.org/r/20211201150440.1088-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/damon/debugfs_attrs.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh
index 196b6640bf37..fc80380c59f0 100644
--- a/tools/testing/selftests/damon/debugfs_attrs.sh
+++ b/tools/testing/selftests/damon/debugfs_attrs.sh
@@ -44,6 +44,15 @@ test_content() {
 
 source ./_chk_dependency.sh
 
+ksft_skip=4
+
+damon_onoff="$DBGFS/monitor_on"
+if [ $(cat "$damon_onoff") = "on" ]
+then
+	echo "monitoring is on"
+	exit $ksft_skip
+fi
+
 # Test attrs file
 # ===============
 
-- 
cgit v1.2.3


From c6980e30af356e85699f37142ae435a6aa483ceb Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:49 -0800
Subject: selftests/damon: test DAMON enabling with empty target_ids case

DAMON debugfs didn't check empty targets when starting monitoring, and
the issue is fixed with commit b5ca3e83ddb0 ("mm/damon/dbgfs: add
adaptive_targets list check before enable monitor_on").  To avoid future
regression, this commit adds a test case for that in DAMON selftests.

Link: https://lkml.kernel.org/r/20211201150440.1088-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/damon/debugfs_attrs.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh
index fc80380c59f0..d0916373f310 100644
--- a/tools/testing/selftests/damon/debugfs_attrs.sh
+++ b/tools/testing/selftests/damon/debugfs_attrs.sh
@@ -94,4 +94,13 @@ test_write_succ "$file" "" "$orig_content" "empty input"
 test_content "$file" "$orig_content" "" "empty input written"
 echo "$orig_content" > "$file"
 
+# Test empty targets case
+# =======================
+
+orig_target_ids=$(cat "$DBGFS/target_ids")
+echo "" > "$DBGFS/target_ids"
+orig_monitor_on=$(cat "$DBGFS/monitor_on")
+test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
+echo "$orig_target_ids" > "$DBGFS/target_ids"
+
 echo "PASS"
-- 
cgit v1.2.3


From d85570c655cc2c257b7da37a6d1fa4c59443c055 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:52 -0800
Subject: selftests/damon: test wrong DAMOS condition ranges input

A patch titled "mm/damon/schemes: add the validity judgment of
thresholds"[1] makes DAMON debugfs interface to validate DAMON scheme
inputs.  This commit adds a test case for the validation logic in DAMON
selftests.

[1] https://lore.kernel.org/linux-mm/d78360e52158d786fcbf20bc62c96785742e76d3.1637239568.git.xhao@linux.alibaba.com/

Link: https://lkml.kernel.org/r/20211201150440.1088-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/damon/debugfs_attrs.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh
index d0916373f310..1ef118617167 100644
--- a/tools/testing/selftests/damon/debugfs_attrs.sh
+++ b/tools/testing/selftests/damon/debugfs_attrs.sh
@@ -77,6 +77,8 @@ test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \
 test_write_fail "$file" "1 2
 3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines"
 test_write_succ "$file" "" "$orig_content" "disabling"
+test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \
+	"$orig_content" "wrong condition ranges"
 echo "$orig_content" > "$file"
 
 # Test target_ids file
-- 
cgit v1.2.3


From b4a002889d24979295ed3c2bf1d5fcfb3901026a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:55 -0800
Subject: selftests/damon: test debugfs file reads/writes with huge count

DAMON debugfs interface users were able to trigger warning by writing
some files with arbitrarily large 'count' parameter.  The issue is fixed
with commit db7a347b26fe ("mm/damon/dbgfs: use '__GFP_NOWARN' for
user-specified size buffer allocation").  This commit adds a test case
for the issue in DAMON selftests to avoid future regressions.

Link: https://lkml.kernel.org/r/20211201150440.1088-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/damon/.gitignore           |  2 ++
 tools/testing/selftests/damon/Makefile             |  2 ++
 tools/testing/selftests/damon/debugfs_attrs.sh     | 18 ++++++++++
 .../selftests/damon/huge_count_read_write.c        | 39 ++++++++++++++++++++++
 4 files changed, 61 insertions(+)
 create mode 100644 tools/testing/selftests/damon/.gitignore
 create mode 100644 tools/testing/selftests/damon/huge_count_read_write.c

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore
new file mode 100644
index 000000000000..c6c2965a6607
--- /dev/null
+++ b/tools/testing/selftests/damon/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+huge_count_read_write
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 8a3f2cd9fec0..f0aa954b5d13 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for damon selftests
 
+TEST_GEN_FILES += huge_count_read_write
+
 TEST_FILES = _chk_dependency.sh
 TEST_PROGS = debugfs_attrs.sh
 
diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh
index 1ef118617167..23a7b48ca7d3 100644
--- a/tools/testing/selftests/damon/debugfs_attrs.sh
+++ b/tools/testing/selftests/damon/debugfs_attrs.sh
@@ -105,4 +105,22 @@ orig_monitor_on=$(cat "$DBGFS/monitor_on")
 test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
 echo "$orig_target_ids" > "$DBGFS/target_ids"
 
+# Test huge count read write
+# ==========================
+
+dmesg -C
+
+for file in "$DBGFS/"*
+do
+	./huge_count_read_write "$file"
+done
+
+if dmesg | grep -q WARNING
+then
+	dmesg
+	exit 1
+else
+	exit 0
+fi
+
 echo "PASS"
diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c
new file mode 100644
index 000000000000..ad7a6b4cf338
--- /dev/null
+++ b/tools/testing/selftests/damon/huge_count_read_write.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+
+void write_read_with_huge_count(char *file)
+{
+	int filedesc = open(file, O_RDWR);
+	char buf[25];
+	int ret;
+
+	printf("%s %s\n", __func__, file);
+	if (filedesc < 0) {
+		fprintf(stderr, "failed opening %s\n", file);
+		exit(1);
+	}
+
+	write(filedesc, "", 0xfffffffful);
+	perror("after write: ");
+	ret = read(filedesc, buf, 0xfffffffful);
+	perror("after read: ");
+	close(filedesc);
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <file>\n", argv[0]);
+		exit(1);
+	}
+	write_read_with_huge_count(argv[1]);
+
+	return 0;
+}
-- 
cgit v1.2.3


From 9ab3b0c8ef629f60ef25cb7634ad305315ae94d1 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 10 Dec 2021 14:46:59 -0800
Subject: selftests/damon: split test cases

Currently, the single test program, debugfs.sh, contains all test cases
for DAMON.  When one of the cases fails, finding which case is failed
from the test log is not so easy, and all remaining tests will be
skipped.  To improve the situation, this commit splits the single
program into small test programs having their own names.

Link: https://lkml.kernel.org/r/20211201150440.1088-12-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/damon/Makefile             |   5 +-
 tools/testing/selftests/damon/_debugfs_common.sh   |  52 ++++++++++
 tools/testing/selftests/damon/debugfs_attrs.sh     | 111 +--------------------
 .../selftests/damon/debugfs_empty_targets.sh       |  13 +++
 .../damon/debugfs_huge_count_read_write.sh         |  22 ++++
 tools/testing/selftests/damon/debugfs_schemes.sh   |  19 ++++
 .../testing/selftests/damon/debugfs_target_ids.sh  |  19 ++++
 7 files changed, 129 insertions(+), 112 deletions(-)
 create mode 100644 tools/testing/selftests/damon/_debugfs_common.sh
 create mode 100644 tools/testing/selftests/damon/debugfs_empty_targets.sh
 create mode 100644 tools/testing/selftests/damon/debugfs_huge_count_read_write.sh
 create mode 100644 tools/testing/selftests/damon/debugfs_schemes.sh
 create mode 100644 tools/testing/selftests/damon/debugfs_target_ids.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index f0aa954b5d13..937d36ae9a69 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -3,7 +3,8 @@
 
 TEST_GEN_FILES += huge_count_read_write
 
-TEST_FILES = _chk_dependency.sh
-TEST_PROGS = debugfs_attrs.sh
+TEST_FILES = _chk_dependency.sh _debugfs_common.sh
+TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
+TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh
 
 include ../lib.mk
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
new file mode 100644
index 000000000000..48989d4813ae
--- /dev/null
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+test_write_result() {
+	file=$1
+	content=$2
+	orig_content=$3
+	expect_reason=$4
+	expected=$5
+
+	echo "$content" > "$file"
+	if [ $? -ne "$expected" ]
+	then
+		echo "writing $content to $file doesn't return $expected"
+		echo "expected because: $expect_reason"
+		echo "$orig_content" > "$file"
+		exit 1
+	fi
+}
+
+test_write_succ() {
+	test_write_result "$1" "$2" "$3" "$4" 0
+}
+
+test_write_fail() {
+	test_write_result "$1" "$2" "$3" "$4" 1
+}
+
+test_content() {
+	file=$1
+	orig_content=$2
+	expected=$3
+	expect_reason=$4
+
+	content=$(cat "$file")
+	if [ "$content" != "$expected" ]
+	then
+		echo "reading $file expected $expected but $content"
+		echo "expected because: $expect_reason"
+		echo "$orig_content" > "$file"
+		exit 1
+	fi
+}
+
+source ./_chk_dependency.sh
+
+damon_onoff="$DBGFS/monitor_on"
+if [ $(cat "$damon_onoff") = "on" ]
+then
+	echo "monitoring is on"
+	exit $ksft_skip
+fi
diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh
index 23a7b48ca7d3..902e312bca89 100644
--- a/tools/testing/selftests/damon/debugfs_attrs.sh
+++ b/tools/testing/selftests/damon/debugfs_attrs.sh
@@ -1,57 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-test_write_result() {
-	file=$1
-	content=$2
-	orig_content=$3
-	expect_reason=$4
-	expected=$5
-
-	echo "$content" > "$file"
-	if [ $? -ne "$expected" ]
-	then
-		echo "writing $content to $file doesn't return $expected"
-		echo "expected because: $expect_reason"
-		echo "$orig_content" > "$file"
-		exit 1
-	fi
-}
-
-test_write_succ() {
-	test_write_result "$1" "$2" "$3" "$4" 0
-}
-
-test_write_fail() {
-	test_write_result "$1" "$2" "$3" "$4" 1
-}
-
-test_content() {
-	file=$1
-	orig_content=$2
-	expected=$3
-	expect_reason=$4
-
-	content=$(cat "$file")
-	if [ "$content" != "$expected" ]
-	then
-		echo "reading $file expected $expected but $content"
-		echo "expected because: $expect_reason"
-		echo "$orig_content" > "$file"
-		exit 1
-	fi
-}
-
-source ./_chk_dependency.sh
-
-ksft_skip=4
-
-damon_onoff="$DBGFS/monitor_on"
-if [ $(cat "$damon_onoff") = "on" ]
-then
-	echo "monitoring is on"
-	exit $ksft_skip
-fi
+source _debugfs_common.sh
 
 # Test attrs file
 # ===============
@@ -65,62 +15,3 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \
 	"min_nr_regions > max_nr_regions"
 test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written"
 echo "$orig_content" > "$file"
-
-# Test schemes file
-# =================
-
-file="$DBGFS/schemes"
-orig_content=$(cat "$file")
-
-test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \
-	"$orig_content" "valid input"
-test_write_fail "$file" "1 2
-3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines"
-test_write_succ "$file" "" "$orig_content" "disabling"
-test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \
-	"$orig_content" "wrong condition ranges"
-echo "$orig_content" > "$file"
-
-# Test target_ids file
-# ====================
-
-file="$DBGFS/target_ids"
-orig_content=$(cat "$file")
-
-test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input"
-test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input"
-test_content "$file" "$orig_content" "1 2" "non-integer was there"
-test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input"
-test_content "$file" "$orig_content" "" "wrong input written"
-test_write_succ "$file" "" "$orig_content" "empty input"
-test_content "$file" "$orig_content" "" "empty input written"
-echo "$orig_content" > "$file"
-
-# Test empty targets case
-# =======================
-
-orig_target_ids=$(cat "$DBGFS/target_ids")
-echo "" > "$DBGFS/target_ids"
-orig_monitor_on=$(cat "$DBGFS/monitor_on")
-test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
-echo "$orig_target_ids" > "$DBGFS/target_ids"
-
-# Test huge count read write
-# ==========================
-
-dmesg -C
-
-for file in "$DBGFS/"*
-do
-	./huge_count_read_write "$file"
-done
-
-if dmesg | grep -q WARNING
-then
-	dmesg
-	exit 1
-else
-	exit 0
-fi
-
-echo "PASS"
diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh
new file mode 100644
index 000000000000..87aff8083822
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source _debugfs_common.sh
+
+# Test empty targets case
+# =======================
+
+orig_target_ids=$(cat "$DBGFS/target_ids")
+echo "" > "$DBGFS/target_ids"
+orig_monitor_on=$(cat "$DBGFS/monitor_on")
+test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
+echo "$orig_target_ids" > "$DBGFS/target_ids"
diff --git a/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh
new file mode 100644
index 000000000000..922cadac2950
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_huge_count_read_write.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source _debugfs_common.sh
+
+# Test huge count read write
+# ==========================
+
+dmesg -C
+
+for file in "$DBGFS/"*
+do
+	./huge_count_read_write "$file"
+done
+
+if dmesg | grep -q WARNING
+then
+	dmesg
+	exit 1
+else
+	exit 0
+fi
diff --git a/tools/testing/selftests/damon/debugfs_schemes.sh b/tools/testing/selftests/damon/debugfs_schemes.sh
new file mode 100644
index 000000000000..5b39ab44731c
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_schemes.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source _debugfs_common.sh
+
+# Test schemes file
+# =================
+
+file="$DBGFS/schemes"
+orig_content=$(cat "$file")
+
+test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \
+	"$orig_content" "valid input"
+test_write_fail "$file" "1 2
+3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines"
+test_write_succ "$file" "" "$orig_content" "disabling"
+test_write_fail "$file" "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" \
+	"$orig_content" "wrong condition ranges"
+echo "$orig_content" > "$file"
diff --git a/tools/testing/selftests/damon/debugfs_target_ids.sh b/tools/testing/selftests/damon/debugfs_target_ids.sh
new file mode 100644
index 000000000000..49aeabdb0aae
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source _debugfs_common.sh
+
+# Test target_ids file
+# ====================
+
+file="$DBGFS/target_ids"
+orig_content=$(cat "$file")
+
+test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input"
+test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input"
+test_content "$file" "$orig_content" "1 2" "non-integer was there"
+test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input"
+test_content "$file" "$orig_content" "" "wrong input written"
+test_write_succ "$file" "" "$orig_content" "empty input"
+test_content "$file" "$orig_content" "" "empty input written"
+echo "$orig_content" > "$file"
-- 
cgit v1.2.3


From c6dbd3e5e69cf3ca47a3864115d4cbdd44619243 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 15 Nov 2021 17:46:39 +0100
Subject: x86/mmx_32: Remove X86_USE_3DNOW

This code puts an exception table entry on the PREFETCH instruction to
overwrite it with a JMP.d8 when it triggers an exception. Except of
course, our code is no longer writable, also SMP.

Instead of fixing this broken mess, simply take it out.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/YZKQzUmeNuwyvZpk@hirez.programming.kicks-ass.net
---
 arch/x86/Kconfig                               |   2 +-
 arch/x86/Kconfig.cpu                           |   4 -
 arch/x86/include/asm/mmx.h                     |  15 -
 arch/x86/include/asm/page_32.h                 |  14 -
 arch/x86/include/asm/required-features.h       |   4 -
 arch/x86/include/asm/string_32.h               |  33 ---
 arch/x86/lib/Makefile                          |   1 -
 arch/x86/lib/memcpy_32.c                       |   4 -
 arch/x86/lib/mmx_32.c                          | 388 -------------------------
 arch/x86/lib/usercopy_32.c                     |   1 -
 tools/arch/x86/include/asm/required-features.h |   4 -
 11 files changed, 1 insertion(+), 469 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dd13ba82ce7a..0c9c680fd15a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1957,7 +1957,7 @@ config EFI
 
 config EFI_STUB
 	bool "EFI stub support"
-	depends on EFI && !X86_USE_3DNOW
+	depends on EFI
 	depends on $(cc-option,-mabi=ms) || X86_32
 	select RELOCATABLE
 	help
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index eefc434351db..542377cd419d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -342,10 +342,6 @@ config X86_USE_PPRO_CHECKSUM
 	def_bool y
 	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
 
-config X86_USE_3DNOW
-	def_bool y
-	depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
 # 6 processor, except that it is broken on certain VIA chips.
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
index f572d0f944bb..e69de29bb2d1 100644
--- a/arch/x86/include/asm/mmx.h
+++ b/arch/x86/include/asm/mmx.h
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MMX_H
-#define _ASM_X86_MMX_H
-
-/*
- *	MMX 3Dnow! helper operations
- */
-
-#include <linux/types.h>
-
-extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
-extern void mmx_copy_page(void *to, void *from);
-
-#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index b13f8488ac85..df42f8aa99e4 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -19,19 +19,6 @@ extern unsigned long __phys_addr(unsigned long);
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
 #endif /* CONFIG_FLATMEM */
 
-#ifdef CONFIG_X86_USE_3DNOW
-#include <asm/mmx.h>
-
-static inline void clear_page(void *page)
-{
-	mmx_clear_page(page);
-}
-
-static inline void copy_page(void *to, void *from)
-{
-	mmx_copy_page(to, from);
-}
-#else  /* !CONFIG_X86_USE_3DNOW */
 #include <linux/string.h>
 
 static inline void clear_page(void *page)
@@ -43,7 +30,6 @@ static inline void copy_page(void *to, void *from)
 {
 	memcpy(to, from, PAGE_SIZE);
 }
-#endif	/* CONFIG_X86_USE_3DNOW */
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index b2d504f11937..aff774775c67 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -35,11 +35,7 @@
 # define NEED_CMOV	0
 #endif
 
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW	(1<<(X86_FEATURE_3DNOW & 31))
-#else
 # define NEED_3DNOW	0
-#endif
 
 #if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
 # define NEED_NOPL	(1<<(X86_FEATURE_NOPL & 31))
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index f74362b05619..32c0d981a82a 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -146,42 +146,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
 extern void *memcpy(void *, const void *, size_t);
 
 #ifndef CONFIG_FORTIFY_SOURCE
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-/*
- *	This CPU favours 3DNow strongly (eg AMD Athlon)
- */
-
-static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
-{
-	if (len < 512)
-		return __constant_memcpy(to, from, len);
-	return _mmx_memcpy(to, from, len);
-}
-
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
-{
-	if (len < 512)
-		return __memcpy(to, from, len);
-	return _mmx_memcpy(to, from, len);
-}
-
-#define memcpy(t, f, n)				\
-	(__builtin_constant_p((n))		\
-	 ? __constant_memcpy3d((t), (f), (n))	\
-	 : __memcpy3d((t), (f), (n)))
-
-#else
-
-/*
- *	No 3D Now!
- */
 
 #define memcpy(t, f, n) __builtin_memcpy(t, f, n)
 
-#endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
 
 #define __HAVE_ARCH_MEMMOVE
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index c6506c6a7092..f76747862bd2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -63,7 +63,6 @@ ifeq ($(CONFIG_X86_32),y)
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
         lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
-        lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index e565d1c9019e..3a6e6cfe8c35 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -7,11 +7,7 @@
 
 __visible void *memcpy(void *to, const void *from, size_t n)
 {
-#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE)
-	return __memcpy3d(to, from, n);
-#else
 	return __memcpy(to, from, n);
-#endif
 }
 EXPORT_SYMBOL(memcpy);
 
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc5f4ea943d3..e69de29bb2d1 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *	MMX 3DNow! library helper functions
- *
- *	To do:
- *	We can use MMX just for prefetch in IRQ's. This may be a win.
- *		(reported so on K6-III)
- *	We should use a better code neutral filler for the short jump
- *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
- *	We also want to clobber the filler register so we don't get any
- *		register forwarding stalls on the filler.
- *
- *	Add *user handling. Checksums are not a win with MMX on any CPU
- *	tested so far for any MMX solution figured.
- *
- *	22/09/2000 - Arjan van de Ven
- *		Improved for non-engineering-sample Athlons
- *
- */
-#include <linux/hardirq.h>
-#include <linux/string.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include <asm/fpu/api.h>
-#include <asm/asm.h>
-
-/*
- * Use KFPU_387.  MMX instructions are not affected by MXCSR,
- * but both AMD and Intel documentation states that even integer MMX
- * operations will result in #MF if an exception is pending in FCW.
- *
- * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
- * any subsequent user of the 387 stack will reinitialize it using
- * KFPU_387.
- */
-
-void *_mmx_memcpy(void *to, const void *from, size_t len)
-{
-	void *p;
-	int i;
-
-	if (unlikely(in_interrupt()))
-		return __memcpy(to, from, len);
-
-	p = to;
-	i = len >> 6; /* len/64 */
-
-	kernel_fpu_begin_mask(KFPU_387);
-
-	__asm__ __volatile__ (
-		"1: prefetch (%0)\n"		/* This set is 28 bytes */
-		"   prefetch 64(%0)\n"
-		"   prefetch 128(%0)\n"
-		"   prefetch 192(%0)\n"
-		"   prefetch 256(%0)\n"
-		"2:  \n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-			_ASM_EXTABLE(1b, 3b)
-			: : "r" (from));
-
-	for ( ; i > 5; i--) {
-		__asm__ __volatile__ (
-		"1:  prefetch 320(%0)\n"
-		"2:  movq (%0), %%mm0\n"
-		"  movq 8(%0), %%mm1\n"
-		"  movq 16(%0), %%mm2\n"
-		"  movq 24(%0), %%mm3\n"
-		"  movq %%mm0, (%1)\n"
-		"  movq %%mm1, 8(%1)\n"
-		"  movq %%mm2, 16(%1)\n"
-		"  movq %%mm3, 24(%1)\n"
-		"  movq 32(%0), %%mm0\n"
-		"  movq 40(%0), %%mm1\n"
-		"  movq 48(%0), %%mm2\n"
-		"  movq 56(%0), %%mm3\n"
-		"  movq %%mm0, 32(%1)\n"
-		"  movq %%mm1, 40(%1)\n"
-		"  movq %%mm2, 48(%1)\n"
-		"  movq %%mm3, 56(%1)\n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-			_ASM_EXTABLE(1b, 3b)
-			: : "r" (from), "r" (to) : "memory");
-
-		from += 64;
-		to += 64;
-	}
-
-	for ( ; i > 0; i--) {
-		__asm__ __volatile__ (
-		"  movq (%0), %%mm0\n"
-		"  movq 8(%0), %%mm1\n"
-		"  movq 16(%0), %%mm2\n"
-		"  movq 24(%0), %%mm3\n"
-		"  movq %%mm0, (%1)\n"
-		"  movq %%mm1, 8(%1)\n"
-		"  movq %%mm2, 16(%1)\n"
-		"  movq %%mm3, 24(%1)\n"
-		"  movq 32(%0), %%mm0\n"
-		"  movq 40(%0), %%mm1\n"
-		"  movq 48(%0), %%mm2\n"
-		"  movq 56(%0), %%mm3\n"
-		"  movq %%mm0, 32(%1)\n"
-		"  movq %%mm1, 40(%1)\n"
-		"  movq %%mm2, 48(%1)\n"
-		"  movq %%mm3, 56(%1)\n"
-			: : "r" (from), "r" (to) : "memory");
-
-		from += 64;
-		to += 64;
-	}
-	/*
-	 * Now do the tail of the block:
-	 */
-	__memcpy(to, from, len & 63);
-	kernel_fpu_end();
-
-	return p;
-}
-EXPORT_SYMBOL(_mmx_memcpy);
-
-#ifdef CONFIG_MK7
-
-/*
- *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
- *	other MMX using processors do not.
- */
-
-static void fast_clear_page(void *page)
-{
-	int i;
-
-	kernel_fpu_begin_mask(KFPU_387);
-
-	__asm__ __volatile__ (
-		"  pxor %%mm0, %%mm0\n" : :
-	);
-
-	for (i = 0; i < 4096/64; i++) {
-		__asm__ __volatile__ (
-		"  movntq %%mm0, (%0)\n"
-		"  movntq %%mm0, 8(%0)\n"
-		"  movntq %%mm0, 16(%0)\n"
-		"  movntq %%mm0, 24(%0)\n"
-		"  movntq %%mm0, 32(%0)\n"
-		"  movntq %%mm0, 40(%0)\n"
-		"  movntq %%mm0, 48(%0)\n"
-		"  movntq %%mm0, 56(%0)\n"
-		: : "r" (page) : "memory");
-		page += 64;
-	}
-
-	/*
-	 * Since movntq is weakly-ordered, a "sfence" is needed to become
-	 * ordered again:
-	 */
-	__asm__ __volatile__("sfence\n"::);
-
-	kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
-	int i;
-
-	kernel_fpu_begin_mask(KFPU_387);
-
-	/*
-	 * maybe the prefetch stuff can go before the expensive fnsave...
-	 * but that is for later. -AV
-	 */
-	__asm__ __volatile__(
-		"1: prefetch (%0)\n"
-		"   prefetch 64(%0)\n"
-		"   prefetch 128(%0)\n"
-		"   prefetch 192(%0)\n"
-		"   prefetch 256(%0)\n"
-		"2:  \n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-			_ASM_EXTABLE(1b, 3b) : : "r" (from));
-
-	for (i = 0; i < (4096-320)/64; i++) {
-		__asm__ __volatile__ (
-		"1: prefetch 320(%0)\n"
-		"2: movq (%0), %%mm0\n"
-		"   movntq %%mm0, (%1)\n"
-		"   movq 8(%0), %%mm1\n"
-		"   movntq %%mm1, 8(%1)\n"
-		"   movq 16(%0), %%mm2\n"
-		"   movntq %%mm2, 16(%1)\n"
-		"   movq 24(%0), %%mm3\n"
-		"   movntq %%mm3, 24(%1)\n"
-		"   movq 32(%0), %%mm4\n"
-		"   movntq %%mm4, 32(%1)\n"
-		"   movq 40(%0), %%mm5\n"
-		"   movntq %%mm5, 40(%1)\n"
-		"   movq 48(%0), %%mm6\n"
-		"   movntq %%mm6, 48(%1)\n"
-		"   movq 56(%0), %%mm7\n"
-		"   movntq %%mm7, 56(%1)\n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
-
-		from += 64;
-		to += 64;
-	}
-
-	for (i = (4096-320)/64; i < 4096/64; i++) {
-		__asm__ __volatile__ (
-		"2: movq (%0), %%mm0\n"
-		"   movntq %%mm0, (%1)\n"
-		"   movq 8(%0), %%mm1\n"
-		"   movntq %%mm1, 8(%1)\n"
-		"   movq 16(%0), %%mm2\n"
-		"   movntq %%mm2, 16(%1)\n"
-		"   movq 24(%0), %%mm3\n"
-		"   movntq %%mm3, 24(%1)\n"
-		"   movq 32(%0), %%mm4\n"
-		"   movntq %%mm4, 32(%1)\n"
-		"   movq 40(%0), %%mm5\n"
-		"   movntq %%mm5, 40(%1)\n"
-		"   movq 48(%0), %%mm6\n"
-		"   movntq %%mm6, 48(%1)\n"
-		"   movq 56(%0), %%mm7\n"
-		"   movntq %%mm7, 56(%1)\n"
-			: : "r" (from), "r" (to) : "memory");
-		from += 64;
-		to += 64;
-	}
-	/*
-	 * Since movntq is weakly-ordered, a "sfence" is needed to become
-	 * ordered again:
-	 */
-	__asm__ __volatile__("sfence \n"::);
-	kernel_fpu_end();
-}
-
-#else /* CONFIG_MK7 */
-
-/*
- *	Generic MMX implementation without K7 specific streaming
- */
-static void fast_clear_page(void *page)
-{
-	int i;
-
-	kernel_fpu_begin_mask(KFPU_387);
-
-	__asm__ __volatile__ (
-		"  pxor %%mm0, %%mm0\n" : :
-	);
-
-	for (i = 0; i < 4096/128; i++) {
-		__asm__ __volatile__ (
-		"  movq %%mm0, (%0)\n"
-		"  movq %%mm0, 8(%0)\n"
-		"  movq %%mm0, 16(%0)\n"
-		"  movq %%mm0, 24(%0)\n"
-		"  movq %%mm0, 32(%0)\n"
-		"  movq %%mm0, 40(%0)\n"
-		"  movq %%mm0, 48(%0)\n"
-		"  movq %%mm0, 56(%0)\n"
-		"  movq %%mm0, 64(%0)\n"
-		"  movq %%mm0, 72(%0)\n"
-		"  movq %%mm0, 80(%0)\n"
-		"  movq %%mm0, 88(%0)\n"
-		"  movq %%mm0, 96(%0)\n"
-		"  movq %%mm0, 104(%0)\n"
-		"  movq %%mm0, 112(%0)\n"
-		"  movq %%mm0, 120(%0)\n"
-			: : "r" (page) : "memory");
-		page += 128;
-	}
-
-	kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
-	int i;
-
-	kernel_fpu_begin_mask(KFPU_387);
-
-	__asm__ __volatile__ (
-		"1: prefetch (%0)\n"
-		"   prefetch 64(%0)\n"
-		"   prefetch 128(%0)\n"
-		"   prefetch 192(%0)\n"
-		"   prefetch 256(%0)\n"
-		"2:  \n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-			_ASM_EXTABLE(1b, 3b) : : "r" (from));
-
-	for (i = 0; i < 4096/64; i++) {
-		__asm__ __volatile__ (
-		"1: prefetch 320(%0)\n"
-		"2: movq (%0), %%mm0\n"
-		"   movq 8(%0), %%mm1\n"
-		"   movq 16(%0), %%mm2\n"
-		"   movq 24(%0), %%mm3\n"
-		"   movq %%mm0, (%1)\n"
-		"   movq %%mm1, 8(%1)\n"
-		"   movq %%mm2, 16(%1)\n"
-		"   movq %%mm3, 24(%1)\n"
-		"   movq 32(%0), %%mm0\n"
-		"   movq 40(%0), %%mm1\n"
-		"   movq 48(%0), %%mm2\n"
-		"   movq 56(%0), %%mm3\n"
-		"   movq %%mm0, 32(%1)\n"
-		"   movq %%mm1, 40(%1)\n"
-		"   movq %%mm2, 48(%1)\n"
-		"   movq %%mm3, 56(%1)\n"
-		".section .fixup, \"ax\"\n"
-		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
-		"   jmp 2b\n"
-		".previous\n"
-			_ASM_EXTABLE(1b, 3b)
-			: : "r" (from), "r" (to) : "memory");
-
-		from += 64;
-		to += 64;
-	}
-	kernel_fpu_end();
-}
-
-#endif /* !CONFIG_MK7 */
-
-/*
- * Favour MMX for page clear and copy:
- */
-static void slow_zero_page(void *page)
-{
-	int d0, d1;
-
-	__asm__ __volatile__(
-		"cld\n\t"
-		"rep ; stosl"
-
-			: "=&c" (d0), "=&D" (d1)
-			:"a" (0), "1" (page), "0" (1024)
-			:"memory");
-}
-
-void mmx_clear_page(void *page)
-{
-	if (unlikely(in_interrupt()))
-		slow_zero_page(page);
-	else
-		fast_clear_page(page);
-}
-EXPORT_SYMBOL(mmx_clear_page);
-
-static void slow_copy_page(void *to, void *from)
-{
-	int d0, d1, d2;
-
-	__asm__ __volatile__(
-		"cld\n\t"
-		"rep ; movsl"
-		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-		: "0" (1024), "1" ((long) to), "2" ((long) from)
-		: "memory");
-}
-
-void mmx_copy_page(void *to, void *from)
-{
-	if (unlikely(in_interrupt()))
-		slow_copy_page(to, from);
-	else
-		fast_copy_page(to, from);
-}
-EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7d290777246d..962006bdb8a8 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -8,7 +8,6 @@
  */
 #include <linux/export.h>
 #include <linux/uaccess.h>
-#include <asm/mmx.h>
 #include <asm/asm.h>
 
 #ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index b2d504f11937..aff774775c67 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -35,11 +35,7 @@
 # define NEED_CMOV	0
 #endif
 
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW	(1<<(X86_FEATURE_3DNOW & 31))
-#else
 # define NEED_3DNOW	0
-#endif
 
 #if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
 # define NEED_NOPL	(1<<(X86_FEATURE_NOPL & 31))
-- 
cgit v1.2.3


From 82a8954acd93ae95d6252fb93a3d210c8f71b093 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 10 Nov 2021 11:01:25 +0100
Subject: objtool: Remove .fixup handling

The .fixup has gone the way of the Dodo, that test will always be
false.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20211110101326.261496792@infradead.org
---
 tools/objtool/check.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index e28172f6e792..9fd81490ff07 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3310,14 +3310,10 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
 		return true;
 
 	/*
-	 * Ignore any unused exceptions.  This can happen when a whitelisted
-	 * function has an exception table entry.
-	 *
-	 * Also ignore alternative replacement instructions.  This can happen
+	 * Ignore alternative replacement instructions.  This can happen
 	 * when a whitelisted function uses one of the ALTERNATIVE macros.
 	 */
-	if (!strcmp(insn->sec->name, ".fixup") ||
-	    !strcmp(insn->sec->name, ".altinstr_replacement") ||
+	if (!strcmp(insn->sec->name, ".altinstr_replacement") ||
 	    !strcmp(insn->sec->name, ".altinstr_aux"))
 		return true;
 
-- 
cgit v1.2.3


From c897899752478d4c905c56f2b54b99ba82b34e13 Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Wed, 1 Dec 2021 12:33:29 +0000
Subject: perf tools: Prevent out-of-bounds access to registers

The size of the cache of register values is arch-dependant
(PERF_REGS_MAX). This has the potential of causing an out-of-bounds
access in the function "perf_reg_value" if the local architecture
contains less registers than the one the perf.data file was recorded on.

Since the maximum number of registers is bound by the bitmask "u64
cache_mask", and the size of the cache when running under x86 systems is
64 already, fix the size to 64 and add a range-check to the function
"perf_reg_value" to prevent out-of-bounds access.

Reported-by: Alexandre Truong <alexandre.truong@arm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: German Gomez <german.gomez@arm.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Link: https://lore.kernel.org/r/20211201123334.679131-2-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.h     | 5 ++++-
 tools/perf/util/perf_regs.c | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 95ffed66369c..c59331eea1d9 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -44,13 +44,16 @@ struct perf_event_attr;
 /* perf sample has 16 bits size limit */
 #define PERF_SAMPLE_MAX_SIZE (1 << 16)
 
+/* number of register is bound by the number of bits in regs_dump::mask (64) */
+#define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64))
+
 struct regs_dump {
 	u64 abi;
 	u64 mask;
 	u64 *regs;
 
 	/* Cached values/mask filled by first register access. */
-	u64 cache_regs[PERF_REGS_MAX];
+	u64 cache_regs[PERF_SAMPLE_REGS_CACHE_SIZE];
 	u64 cache_mask;
 };
 
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 5ee47ae1509c..06a7461ba864 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -25,6 +25,9 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
 	int i, idx = 0;
 	u64 mask = regs->mask;
 
+	if ((u64)id >= PERF_SAMPLE_REGS_CACHE_SIZE)
+		return -EINVAL;
+
 	if (regs->cache_mask & (1ULL << id))
 		goto out;
 
-- 
cgit v1.2.3


From 057ae59f5a1d924511beb1b09f395bdb316cfd03 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:22:57 +0200
Subject: perf intel-pt: Fix some PGE (packet generation enable/control flow
 packets) usage

Packet generation enable (PGE) refers to whether control flow (COFI)
packets are being produced.

PGE may be false even when branch-tracing is enabled, due to being
out-of-context, or outside a filter address range.  Fix some missing PGE
usage.

Fixes: 7c1b16ba0e26e6 ("perf intel-pt: Add support for decoding FUP/TIP only")
Fixes: 839598176b0554 ("perf intel-pt: Allow decoding with branch tracing disabled")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-2-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 5f83937bf8f3..6f6f163161a9 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -2678,6 +2678,7 @@ static int intel_pt_hop_trace(struct intel_pt_decoder *decoder, bool *no_tip, in
 		return HOP_IGNORE;
 
 	case INTEL_PT_TIP_PGD:
+		decoder->pge = false;
 		if (!decoder->packet.count) {
 			intel_pt_set_nr(decoder);
 			return HOP_IGNORE;
@@ -2707,7 +2708,7 @@ static int intel_pt_hop_trace(struct intel_pt_decoder *decoder, bool *no_tip, in
 		intel_pt_set_ip(decoder);
 		if (intel_pt_fup_event(decoder))
 			return HOP_RETURN;
-		if (!decoder->branch_enable)
+		if (!decoder->branch_enable || !decoder->pge)
 			*no_tip = true;
 		if (*no_tip) {
 			decoder->state.type = INTEL_PT_INSTRUCTION;
@@ -2897,7 +2898,7 @@ static bool intel_pt_psb_with_fup(struct intel_pt_decoder *decoder, int *err)
 {
 	struct intel_pt_psb_info data = { .fup = false };
 
-	if (!decoder->branch_enable || !decoder->pge)
+	if (!decoder->branch_enable)
 		return false;
 
 	intel_pt_pkt_lookahead(decoder, intel_pt_psb_lookahead_cb, &data);
@@ -2999,7 +3000,7 @@ next:
 				break;
 			}
 			intel_pt_set_last_ip(decoder);
-			if (!decoder->branch_enable) {
+			if (!decoder->branch_enable || !decoder->pge) {
 				decoder->ip = decoder->last_ip;
 				if (intel_pt_fup_event(decoder))
 					return 0;
-- 
cgit v1.2.3


From ad106a26aef3a95ac7ca88d033b431661ba346ce Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:22:58 +0200
Subject: perf intel-pt: Fix sync state when a PSB (synchronization) packet is
 found

When syncing, it may be that branch packet generation is not enabled at
that point, in which case there will not immediately be a control-flow
packet, so some packets before a control flow packet turns up, get
ignored.  However, the decoder is in sync as soon as a PSB is found, so
the state should be set accordingly.

Fixes: f4aa081949e7b6 ("perf tools: Add Intel PT decoder")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-3-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 6f6f163161a9..bddf98123dc3 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -3608,7 +3608,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
 	}
 
 	decoder->have_last_ip = true;
-	decoder->pkt_state = INTEL_PT_STATE_NO_IP;
+	decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
 
 	err = intel_pt_walk_psb(decoder);
 	if (err)
-- 
cgit v1.2.3


From 4c761d805bb2d2ead1b9baaba75496152b394c80 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:22:59 +0200
Subject: perf intel-pt: Fix intel_pt_fup_event() assumptions about setting
 state type

intel_pt_fup_event() assumes it can overwrite the state type if there has
been an FUP event, but this is an unnecessary and unexpected constraint on
callers.

Fix by touching only the state type flags that are affected by an FUP
event.

Fixes: a472e65fc490a ("perf intel-pt: Add decoder support for ptwrite and power event packets")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-4-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/util/intel-pt-decoder/intel-pt-decoder.c  | 32 +++++++++-------------
 1 file changed, 13 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index bddf98123dc3..16fbbf07e367 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -1205,61 +1205,55 @@ out_no_progress:
 
 static bool intel_pt_fup_event(struct intel_pt_decoder *decoder)
 {
+	enum intel_pt_sample_type type = decoder->state.type;
 	bool ret = false;
 
+	decoder->state.type &= ~INTEL_PT_BRANCH;
+
 	if (decoder->set_fup_tx_flags) {
 		decoder->set_fup_tx_flags = false;
 		decoder->tx_flags = decoder->fup_tx_flags;
-		decoder->state.type = INTEL_PT_TRANSACTION;
+		decoder->state.type |= INTEL_PT_TRANSACTION;
 		if (decoder->fup_tx_flags & INTEL_PT_ABORT_TX)
 			decoder->state.type |= INTEL_PT_BRANCH;
-		decoder->state.from_ip = decoder->ip;
-		decoder->state.to_ip = 0;
 		decoder->state.flags = decoder->fup_tx_flags;
-		return true;
+		ret = true;
 	}
 	if (decoder->set_fup_ptw) {
 		decoder->set_fup_ptw = false;
-		decoder->state.type = INTEL_PT_PTW;
+		decoder->state.type |= INTEL_PT_PTW;
 		decoder->state.flags |= INTEL_PT_FUP_IP;
-		decoder->state.from_ip = decoder->ip;
-		decoder->state.to_ip = 0;
 		decoder->state.ptw_payload = decoder->fup_ptw_payload;
-		return true;
+		ret = true;
 	}
 	if (decoder->set_fup_mwait) {
 		decoder->set_fup_mwait = false;
-		decoder->state.type = INTEL_PT_MWAIT_OP;
-		decoder->state.from_ip = decoder->ip;
-		decoder->state.to_ip = 0;
+		decoder->state.type |= INTEL_PT_MWAIT_OP;
 		decoder->state.mwait_payload = decoder->fup_mwait_payload;
 		ret = true;
 	}
 	if (decoder->set_fup_pwre) {
 		decoder->set_fup_pwre = false;
 		decoder->state.type |= INTEL_PT_PWR_ENTRY;
-		decoder->state.type &= ~INTEL_PT_BRANCH;
-		decoder->state.from_ip = decoder->ip;
-		decoder->state.to_ip = 0;
 		decoder->state.pwre_payload = decoder->fup_pwre_payload;
 		ret = true;
 	}
 	if (decoder->set_fup_exstop) {
 		decoder->set_fup_exstop = false;
 		decoder->state.type |= INTEL_PT_EX_STOP;
-		decoder->state.type &= ~INTEL_PT_BRANCH;
 		decoder->state.flags |= INTEL_PT_FUP_IP;
-		decoder->state.from_ip = decoder->ip;
-		decoder->state.to_ip = 0;
 		ret = true;
 	}
 	if (decoder->set_fup_bep) {
 		decoder->set_fup_bep = false;
 		decoder->state.type |= INTEL_PT_BLK_ITEMS;
-		decoder->state.type &= ~INTEL_PT_BRANCH;
+		ret = true;
+	}
+	if (ret) {
 		decoder->state.from_ip = decoder->ip;
 		decoder->state.to_ip = 0;
-		ret = true;
+	} else {
+		decoder->state.type = type;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From c79ee2b2160909889df67c8801352d3e69d43a1a Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:23:00 +0200
Subject: perf intel-pt: Fix state setting when receiving overflow (OVF) packet

An overflow (OVF packet) is treated as an error because it represents a
loss of trace data, but there is no loss of synchronization, so the packet
state should be INTEL_PT_STATE_IN_SYNC not INTEL_PT_STATE_ERR_RESYNC.

To support that, some additional variables must be reset, and the FUP
packet that may follow OVF is treated as an FUP event.

Fixes: f4aa081949e7b6 ("perf tools: Add Intel PT decoder")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-5-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/util/intel-pt-decoder/intel-pt-decoder.c  | 32 +++++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 16fbbf07e367..845b0ca866a4 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -1249,6 +1249,20 @@ static bool intel_pt_fup_event(struct intel_pt_decoder *decoder)
 		decoder->state.type |= INTEL_PT_BLK_ITEMS;
 		ret = true;
 	}
+	if (decoder->overflow) {
+		decoder->overflow = false;
+		if (!ret && !decoder->pge) {
+			if (decoder->hop) {
+				decoder->state.type = 0;
+				decoder->pkt_state = INTEL_PT_STATE_RESAMPLE;
+			}
+			decoder->pge = true;
+			decoder->state.type |= INTEL_PT_BRANCH | INTEL_PT_TRACE_BEGIN;
+			decoder->state.from_ip = 0;
+			decoder->state.to_ip = decoder->ip;
+			return true;
+		}
+	}
 	if (ret) {
 		decoder->state.from_ip = decoder->ip;
 		decoder->state.to_ip = 0;
@@ -1602,7 +1616,16 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder)
 	intel_pt_clear_tx_flags(decoder);
 	intel_pt_set_nr(decoder);
 	decoder->timestamp_insn_cnt = 0;
-	decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC;
+	decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
+	decoder->state.from_ip = decoder->ip;
+	decoder->ip = 0;
+	decoder->pge = false;
+	decoder->set_fup_tx_flags = false;
+	decoder->set_fup_ptw = false;
+	decoder->set_fup_mwait = false;
+	decoder->set_fup_pwre = false;
+	decoder->set_fup_exstop = false;
+	decoder->set_fup_bep = false;
 	decoder->overflow = true;
 	return -EOVERFLOW;
 }
@@ -2957,6 +2980,7 @@ next:
 
 		case INTEL_PT_TIP_PGE: {
 			decoder->pge = true;
+			decoder->overflow = false;
 			intel_pt_mtc_cyc_cnt_pge(decoder);
 			intel_pt_set_nr(decoder);
 			if (decoder->packet.count == 0) {
@@ -3462,10 +3486,10 @@ static int intel_pt_sync_ip(struct intel_pt_decoder *decoder)
 	decoder->set_fup_pwre = false;
 	decoder->set_fup_exstop = false;
 	decoder->set_fup_bep = false;
+	decoder->overflow = false;
 
 	if (!decoder->branch_enable) {
 		decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
-		decoder->overflow = false;
 		decoder->state.type = 0; /* Do not have a sample */
 		return 0;
 	}
@@ -3480,7 +3504,6 @@ static int intel_pt_sync_ip(struct intel_pt_decoder *decoder)
 		decoder->pkt_state = INTEL_PT_STATE_RESAMPLE;
 	else
 		decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
-	decoder->overflow = false;
 
 	decoder->state.from_ip = 0;
 	decoder->state.to_ip = decoder->ip;
@@ -3699,7 +3722,8 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
 
 	if (err) {
 		decoder->state.err = intel_pt_ext_err(err);
-		decoder->state.from_ip = decoder->ip;
+		if (err != -EOVERFLOW)
+			decoder->state.from_ip = decoder->ip;
 		intel_pt_update_sample_time(decoder);
 		decoder->sample_tot_cyc_cnt = decoder->tot_cyc_cnt;
 		intel_pt_set_nr(decoder);
-- 
cgit v1.2.3


From a32e6c5da599dbf49e60622a4dfb5b9b40ece029 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:23:01 +0200
Subject: perf intel-pt: Fix next 'err' value, walking trace

Code after label 'next:' in intel_pt_walk_trace() assumes 'err' is zero,
but it may not be, if arrived at via a 'goto'. Ensure it is zero.

Fixes: 7c1b16ba0e26e6 ("perf intel-pt: Add support for decoding FUP/TIP only")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-6-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 845b0ca866a4..75b504aed7f4 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -2942,6 +2942,7 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 		if (err)
 			return err;
 next:
+		err = 0;
 		if (decoder->cyc_threshold) {
 			if (decoder->sample_cyc && last_packet_type != INTEL_PT_CYC)
 				decoder->sample_cyc = false;
-- 
cgit v1.2.3


From a882cc94971093e146ffa1163b140ad956236754 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:23:02 +0200
Subject: perf intel-pt: Fix missing 'instruction' events with 'q' option

FUP packets contain IP information, which makes them also an 'instruction'
event in 'hop' mode i.e. the itrace 'q' option.  That wasn't happening, so
restructure the logic so that FUP events are added along with appropriate
'instruction' and 'branch' events.

Fixes: 7c1b16ba0e26e6 ("perf intel-pt: Add support for decoding FUP/TIP only")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-7-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 75b504aed7f4..0e013c2d9eb4 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -2683,6 +2683,8 @@ static int intel_pt_scan_for_psb(struct intel_pt_decoder *decoder);
 /* Hop mode: Ignore TNT, do not walk code, but get ip from FUPs and TIPs */
 static int intel_pt_hop_trace(struct intel_pt_decoder *decoder, bool *no_tip, int *err)
 {
+	*err = 0;
+
 	/* Leap from PSB to PSB, getting ip from FUP within PSB+ */
 	if (decoder->leap && !decoder->in_psb && decoder->packet.type != INTEL_PT_PSB) {
 		*err = intel_pt_scan_for_psb(decoder);
@@ -2723,18 +2725,21 @@ static int intel_pt_hop_trace(struct intel_pt_decoder *decoder, bool *no_tip, in
 		if (!decoder->packet.count)
 			return HOP_IGNORE;
 		intel_pt_set_ip(decoder);
-		if (intel_pt_fup_event(decoder))
-			return HOP_RETURN;
+		if (decoder->set_fup_mwait || decoder->set_fup_pwre)
+			*no_tip = true;
 		if (!decoder->branch_enable || !decoder->pge)
 			*no_tip = true;
 		if (*no_tip) {
 			decoder->state.type = INTEL_PT_INSTRUCTION;
 			decoder->state.from_ip = decoder->ip;
 			decoder->state.to_ip = 0;
+			intel_pt_fup_event(decoder);
 			return HOP_RETURN;
 		}
+		intel_pt_fup_event(decoder);
+		decoder->state.type |= INTEL_PT_INSTRUCTION | INTEL_PT_BRANCH;
 		*err = intel_pt_walk_fup_tip(decoder);
-		if (!*err)
+		if (!*err && decoder->state.to_ip)
 			decoder->pkt_state = INTEL_PT_STATE_RESAMPLE;
 		return HOP_RETURN;
 
-- 
cgit v1.2.3


From 6665b8e4836caa8023cbc7e53733acd234969c8c Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 10 Dec 2021 18:23:03 +0200
Subject: perf intel-pt: Fix error timestamp setting on the decoder error path

An error timestamp shows the last known timestamp for the queue, but this
is not updated on the error path. Fix by setting it.

Fixes: f4aa081949e7b6 ("perf tools: Add Intel PT decoder")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: stable@vger.kernel.org # v5.15+
Link: https://lore.kernel.org/r/20211210162303.2288710-8-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 556a893508da..10c3187e4c5a 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -2565,6 +2565,7 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
 				ptq->sync_switch = false;
 				intel_pt_next_tid(pt, ptq);
 			}
+			ptq->timestamp = state->est_timestamp;
 			if (pt->synth_opts.errors) {
 				err = intel_ptq_synth_error(ptq, state);
 				if (err)
-- 
cgit v1.2.3


From 9937e8daab29d9e20de6b7bc56c76db7a4eeda69 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Sat, 11 Dec 2021 05:38:53 +0000
Subject: perf python: Fix NULL vs IS_ERR_OR_NULL() checking

The function trace_event__tp_format_id may return ERR_PTR(-ENOMEM).  Use
IS_ERR_OR_NULL to check tp_format.

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: http://lore.kernel.org/lkml/20211211053856.19827-1-linmq006@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/python.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 563a9ba8954f..7f782a31bda3 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -461,7 +461,7 @@ get_tracepoint_field(struct pyrf_event *pevent, PyObject *attr_name)
 		struct tep_event *tp_format;
 
 		tp_format = trace_event__tp_format_id(evsel->core.attr.config);
-		if (!tp_format)
+		if (IS_ERR_OR_NULL(tp_format))
 			return NULL;
 
 		evsel->tp_format = tp_format;
-- 
cgit v1.2.3


From 259172bb6514758ce3be1610c500b51a9f44212a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 11 Dec 2021 17:16:19 -0800
Subject: libbpf: Fix gen_loader assumption on number of programs.

libbpf's obj->nr_programs includes static and global functions. That number
could be higher than the actual number of bpf programs going be loaded by
gen_loader. Passing larger nr_programs to bpf_gen__init() doesn't hurt. Those
exra stack slots will stay as zero. bpf_gen__finish() needs to check that
actual number of progs that gen_loader saw is less than or equal to
obj->nr_programs.

Fixes: ba05fd36b851 ("libbpf: Perform map fd cleanup for gen_loader in case of error")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/gen_loader.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 8ed02e89c9a9..c5859c21d3e9 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -371,8 +371,9 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
 {
 	int i;
 
-	if (nr_progs != gen->nr_progs || nr_maps != gen->nr_maps) {
-		pr_warn("progs/maps mismatch\n");
+	if (nr_progs < gen->nr_progs || nr_maps != gen->nr_maps) {
+		pr_warn("nr_progs %d/%d nr_maps %d/%d mismatch\n",
+			nr_progs, gen->nr_progs, nr_maps, gen->nr_maps);
 		gen->error = -EFAULT;
 		return gen->error;
 	}
-- 
cgit v1.2.3


From c5fb19937455095573a19ddcbff32e993ed10e35 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 10 Dec 2021 22:16:49 +0800
Subject: bpf: Add bpf_strncmp helper

The helper compares two strings: one string is a null-terminated
read-only string, and another string has const max storage size
but doesn't need to be null-terminated. It can be used to compare
file name in tracing or LSM program.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211210141652.877186-2-houtao1@huawei.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 11 +++++++++++
 kernel/bpf/helpers.c           | 16 ++++++++++++++++
 tools/include/uapi/linux/bpf.h | 11 +++++++++++
 4 files changed, 39 insertions(+)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ceb54c6342f..7a40022e3d00 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2163,6 +2163,7 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
+extern const struct bpf_func_proto bpf_strncmp_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c26871263f1f..2820c77e4846 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4983,6 +4983,16 @@ union bpf_attr {
  *	Return
  *		The number of loops performed, **-EINVAL** for invalid **flags**,
  *		**-E2BIG** if **nr_loops** exceeds the maximum number of loops.
+ *
+ * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2)
+ *	Description
+ *		Do strncmp() between **s1** and **s2**. **s1** doesn't need
+ *		to be null-terminated and **s1_sz** is the maximum storage
+ *		size of **s1**. **s2** must be a read-only string.
+ *	Return
+ *		An integer less than, equal to, or greater than zero
+ *		if the first **s1_sz** bytes of **s1** is found to be
+ *		less than, to match, or be greater than **s2**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5167,6 +5177,7 @@ union bpf_attr {
 	FN(kallsyms_lookup_name),	\
 	FN(find_vma),			\
 	FN(loop),			\
+	FN(strncmp),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 19fecfeaa9c2..8babae03d30a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -565,6 +565,20 @@ const struct bpf_func_proto bpf_strtoul_proto = {
 };
 #endif
 
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+	return strncmp(s1, s2, s1_sz);
+}
+
+const struct bpf_func_proto bpf_strncmp_proto = {
+	.func		= bpf_strncmp,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+	.arg3_type	= ARG_PTR_TO_CONST_STR,
+};
+
 BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
 	   struct bpf_pidns_info *, nsdata, u32, size)
 {
@@ -1378,6 +1392,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_loop:
 		return &bpf_loop_proto;
+	case BPF_FUNC_strncmp:
+		return &bpf_strncmp_proto;
 	default:
 		break;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c26871263f1f..2820c77e4846 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4983,6 +4983,16 @@ union bpf_attr {
  *	Return
  *		The number of loops performed, **-EINVAL** for invalid **flags**,
  *		**-E2BIG** if **nr_loops** exceeds the maximum number of loops.
+ *
+ * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2)
+ *	Description
+ *		Do strncmp() between **s1** and **s2**. **s1** doesn't need
+ *		to be null-terminated and **s1_sz** is the maximum storage
+ *		size of **s1**. **s2** must be a read-only string.
+ *	Return
+ *		An integer less than, equal to, or greater than zero
+ *		if the first **s1_sz** bytes of **s1** is found to be
+ *		less than, to match, or be greater than **s2**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5167,6 +5177,7 @@ union bpf_attr {
 	FN(kallsyms_lookup_name),	\
 	FN(find_vma),			\
 	FN(loop),			\
+	FN(strncmp),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 9a93bf3fda3d03762868b1424e898395ffc71575 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 10 Dec 2021 22:16:50 +0800
Subject: selftests/bpf: Fix checkpatch error on empty function parameter

Fix checkpatch error: "ERROR: Bad function definition - void foo()
should probably be void foo(void)". Most replacements are done by
the following command:

  sed -i 's#\([a-z]\)()$#\1(void)#g' testing/selftests/bpf/benchs/*.c

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211210141652.877186-3-houtao1@huawei.com
---
 tools/testing/selftests/bpf/bench.c                |  2 +-
 tools/testing/selftests/bpf/bench.h                |  9 ++++----
 tools/testing/selftests/bpf/benchs/bench_count.c   |  2 +-
 tools/testing/selftests/bpf/benchs/bench_rename.c  | 16 +++++++--------
 .../testing/selftests/bpf/benchs/bench_ringbufs.c  | 14 ++++++-------
 tools/testing/selftests/bpf/benchs/bench_trigger.c | 24 +++++++++++-----------
 6 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 3d6082b97a56..ffe5752f3324 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -39,7 +39,7 @@ static int bump_memlock_rlimit(void)
 	return setrlimit(RLIMIT_MEMLOCK, &rlim_new);
 }
 
-void setup_libbpf()
+void setup_libbpf(void)
 {
 	int err;
 
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index 50785503756b..fb3e213df3dc 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -38,8 +38,8 @@ struct bench_res {
 
 struct bench {
 	const char *name;
-	void (*validate)();
-	void (*setup)();
+	void (*validate)(void);
+	void (*setup)(void);
 	void *(*producer_thread)(void *ctx);
 	void *(*consumer_thread)(void *ctx);
 	void (*measure)(struct bench_res* res);
@@ -54,7 +54,7 @@ struct counter {
 extern struct env env;
 extern const struct bench *bench;
 
-void setup_libbpf();
+void setup_libbpf(void);
 void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
 void hits_drops_report_final(struct bench_res res[], int res_cnt);
 void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
@@ -62,7 +62,8 @@ void false_hits_report_final(struct bench_res res[], int res_cnt);
 void ops_report_progress(int iter, struct bench_res *res, long delta_ns);
 void ops_report_final(struct bench_res res[], int res_cnt);
 
-static inline __u64 get_time_ns() {
+static inline __u64 get_time_ns(void)
+{
 	struct timespec t;
 
 	clock_gettime(CLOCK_MONOTONIC, &t);
diff --git a/tools/testing/selftests/bpf/benchs/bench_count.c b/tools/testing/selftests/bpf/benchs/bench_count.c
index befba7a82643..078972ce208e 100644
--- a/tools/testing/selftests/bpf/benchs/bench_count.c
+++ b/tools/testing/selftests/bpf/benchs/bench_count.c
@@ -36,7 +36,7 @@ static struct count_local_ctx {
 	struct counter *hits;
 } count_local_ctx;
 
-static void count_local_setup()
+static void count_local_setup(void)
 {
 	struct count_local_ctx *ctx = &count_local_ctx;
 
diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c
index c7ec114eca56..3c203b6d6a6e 100644
--- a/tools/testing/selftests/bpf/benchs/bench_rename.c
+++ b/tools/testing/selftests/bpf/benchs/bench_rename.c
@@ -11,7 +11,7 @@ static struct ctx {
 	int fd;
 } ctx;
 
-static void validate()
+static void validate(void)
 {
 	if (env.producer_cnt != 1) {
 		fprintf(stderr, "benchmark doesn't support multi-producer!\n");
@@ -43,7 +43,7 @@ static void measure(struct bench_res *res)
 	res->hits = atomic_swap(&ctx.hits.value, 0);
 }
 
-static void setup_ctx()
+static void setup_ctx(void)
 {
 	setup_libbpf();
 
@@ -71,36 +71,36 @@ static void attach_bpf(struct bpf_program *prog)
 	}
 }
 
-static void setup_base()
+static void setup_base(void)
 {
 	setup_ctx();
 }
 
-static void setup_kprobe()
+static void setup_kprobe(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.prog1);
 }
 
-static void setup_kretprobe()
+static void setup_kretprobe(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.prog2);
 }
 
-static void setup_rawtp()
+static void setup_rawtp(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.prog3);
 }
 
-static void setup_fentry()
+static void setup_fentry(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.prog4);
 }
 
-static void setup_fexit()
+static void setup_fexit(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.prog5);
diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index 52d4a2f91dbd..da8593b3494a 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -88,12 +88,12 @@ const struct argp bench_ringbufs_argp = {
 
 static struct counter buf_hits;
 
-static inline void bufs_trigger_batch()
+static inline void bufs_trigger_batch(void)
 {
 	(void)syscall(__NR_getpgid);
 }
 
-static void bufs_validate()
+static void bufs_validate(void)
 {
 	if (env.consumer_cnt != 1) {
 		fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n");
@@ -132,7 +132,7 @@ static void ringbuf_libbpf_measure(struct bench_res *res)
 	res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
 }
 
-static struct ringbuf_bench *ringbuf_setup_skeleton()
+static struct ringbuf_bench *ringbuf_setup_skeleton(void)
 {
 	struct ringbuf_bench *skel;
 
@@ -167,7 +167,7 @@ static int buf_process_sample(void *ctx, void *data, size_t len)
 	return 0;
 }
 
-static void ringbuf_libbpf_setup()
+static void ringbuf_libbpf_setup(void)
 {
 	struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
 	struct bpf_link *link;
@@ -223,7 +223,7 @@ static void ringbuf_custom_measure(struct bench_res *res)
 	res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
 }
 
-static void ringbuf_custom_setup()
+static void ringbuf_custom_setup(void)
 {
 	struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx;
 	const size_t page_size = getpagesize();
@@ -352,7 +352,7 @@ static void perfbuf_measure(struct bench_res *res)
 	res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
 }
 
-static struct perfbuf_bench *perfbuf_setup_skeleton()
+static struct perfbuf_bench *perfbuf_setup_skeleton(void)
 {
 	struct perfbuf_bench *skel;
 
@@ -390,7 +390,7 @@ perfbuf_process_sample_raw(void *input_ctx, int cpu,
 	return LIBBPF_PERF_EVENT_CONT;
 }
 
-static void perfbuf_libbpf_setup()
+static void perfbuf_libbpf_setup(void)
 {
 	struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx;
 	struct perf_event_attr attr;
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 049a5ad56f65..7f957c55a3ca 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -11,7 +11,7 @@ static struct trigger_ctx {
 
 static struct counter base_hits;
 
-static void trigger_validate()
+static void trigger_validate(void)
 {
 	if (env.consumer_cnt != 1) {
 		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
@@ -45,7 +45,7 @@ static void trigger_measure(struct bench_res *res)
 	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
 }
 
-static void setup_ctx()
+static void setup_ctx(void)
 {
 	setup_libbpf();
 
@@ -67,37 +67,37 @@ static void attach_bpf(struct bpf_program *prog)
 	}
 }
 
-static void trigger_tp_setup()
+static void trigger_tp_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_tp);
 }
 
-static void trigger_rawtp_setup()
+static void trigger_rawtp_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_raw_tp);
 }
 
-static void trigger_kprobe_setup()
+static void trigger_kprobe_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
 }
 
-static void trigger_fentry_setup()
+static void trigger_fentry_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_fentry);
 }
 
-static void trigger_fentry_sleep_setup()
+static void trigger_fentry_sleep_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep);
 }
 
-static void trigger_fmodret_setup()
+static void trigger_fmodret_setup(void)
 {
 	setup_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
@@ -183,22 +183,22 @@ static void usetup(bool use_retprobe, bool use_nop)
 	ctx.skel->links.bench_trigger_uprobe = link;
 }
 
-static void uprobe_setup_with_nop()
+static void uprobe_setup_with_nop(void)
 {
 	usetup(false, true);
 }
 
-static void uretprobe_setup_with_nop()
+static void uretprobe_setup_with_nop(void)
 {
 	usetup(true, true);
 }
 
-static void uprobe_setup_without_nop()
+static void uprobe_setup_without_nop(void)
 {
 	usetup(false, false);
 }
 
-static void uretprobe_setup_without_nop()
+static void uretprobe_setup_without_nop(void)
 {
 	usetup(true, false);
 }
-- 
cgit v1.2.3


From 9c42652f8be3202ad11cf4fbc358688003cff21c Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 10 Dec 2021 22:16:51 +0800
Subject: selftests/bpf: Add benchmark for bpf_strncmp() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add benchmark to compare the performance between home-made strncmp()
in bpf program and bpf_strncmp() helper. In summary, the performance
win of bpf_strncmp() under x86-64 is greater than 18% when the compared
string length is greater than 64, and is 179% when the length is 4095.
Under arm64 the performance win is even bigger: 33% when the length
is greater than 64 and 600% when the length is 4095.

The following is the details:

no-helper-X: use home-made strncmp() to compare X-sized string
helper-Y: use bpf_strncmp() to compare Y-sized string

Under x86-64:

no-helper-1          3.504 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-1             3.347 ± 0.001M/s (drops 0.000 ± 0.000M/s)

no-helper-8          3.357 ± 0.001M/s (drops 0.000 ± 0.000M/s)
helper-8             3.307 ± 0.001M/s (drops 0.000 ± 0.000M/s)

no-helper-32         3.064 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-32            3.253 ± 0.001M/s (drops 0.000 ± 0.000M/s)

no-helper-64         2.563 ± 0.001M/s (drops 0.000 ± 0.000M/s)
helper-64            3.040 ± 0.001M/s (drops 0.000 ± 0.000M/s)

no-helper-128        1.975 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-128           2.641 ± 0.000M/s (drops 0.000 ± 0.000M/s)

no-helper-512        0.759 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-512           1.574 ± 0.000M/s (drops 0.000 ± 0.000M/s)

no-helper-2048       0.329 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-2048          0.602 ± 0.000M/s (drops 0.000 ± 0.000M/s)

no-helper-4095       0.117 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-4095          0.327 ± 0.000M/s (drops 0.000 ± 0.000M/s)

Under arm64:

no-helper-1          2.806 ± 0.004M/s (drops 0.000 ± 0.000M/s)
helper-1             2.819 ± 0.002M/s (drops 0.000 ± 0.000M/s)

no-helper-8          2.797 ± 0.109M/s (drops 0.000 ± 0.000M/s)
helper-8             2.786 ± 0.025M/s (drops 0.000 ± 0.000M/s)

no-helper-32         2.399 ± 0.011M/s (drops 0.000 ± 0.000M/s)
helper-32            2.703 ± 0.002M/s (drops 0.000 ± 0.000M/s)

no-helper-64         2.020 ± 0.015M/s (drops 0.000 ± 0.000M/s)
helper-64            2.702 ± 0.073M/s (drops 0.000 ± 0.000M/s)

no-helper-128        1.604 ± 0.001M/s (drops 0.000 ± 0.000M/s)
helper-128           2.516 ± 0.002M/s (drops 0.000 ± 0.000M/s)

no-helper-512        0.699 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-512           2.106 ± 0.003M/s (drops 0.000 ± 0.000M/s)

no-helper-2048       0.215 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-2048          1.223 ± 0.003M/s (drops 0.000 ± 0.000M/s)

no-helper-4095       0.112 ± 0.000M/s (drops 0.000 ± 0.000M/s)
helper-4095          0.796 ± 0.000M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211210141652.877186-4-houtao1@huawei.com
---
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/bench.c                |   6 +
 tools/testing/selftests/bpf/benchs/bench_strncmp.c | 161 +++++++++++++++++++++
 .../selftests/bpf/benchs/run_bench_strncmp.sh      |  12 ++
 tools/testing/selftests/bpf/progs/strncmp_bench.c  |  50 +++++++
 5 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_strncmp.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_strncmp.sh
 create mode 100644 tools/testing/selftests/bpf/progs/strncmp_bench.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a795bca4c8ec..43f6f3159431 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -537,6 +537,7 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
 			    $(OUTPUT)/perfbuf_bench.skel.h
 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
 $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
+$(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -547,7 +548,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_trigger.o \
 		 $(OUTPUT)/bench_ringbufs.o \
 		 $(OUTPUT)/bench_bloom_filter_map.o \
-		 $(OUTPUT)/bench_bpf_loop.o
+		 $(OUTPUT)/bench_bpf_loop.o \
+		 $(OUTPUT)/bench_strncmp.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index ffe5752f3324..bbb42e2cee0c 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -205,11 +205,13 @@ static const struct argp_option opts[] = {
 extern struct argp bench_ringbufs_argp;
 extern struct argp bench_bloom_map_argp;
 extern struct argp bench_bpf_loop_argp;
+extern struct argp bench_strncmp_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
 	{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
 	{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
+	{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
 	{},
 };
 
@@ -409,6 +411,8 @@ extern const struct bench bench_bloom_false_positive;
 extern const struct bench bench_hashmap_without_bloom;
 extern const struct bench bench_hashmap_with_bloom;
 extern const struct bench bench_bpf_loop;
+extern const struct bench bench_strncmp_no_helper;
+extern const struct bench bench_strncmp_helper;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -441,6 +445,8 @@ static const struct bench *benchs[] = {
 	&bench_hashmap_without_bloom,
 	&bench_hashmap_with_bloom,
 	&bench_bpf_loop,
+	&bench_strncmp_no_helper,
+	&bench_strncmp_helper,
 };
 
 static void setup_benchmark()
diff --git a/tools/testing/selftests/bpf/benchs/bench_strncmp.c b/tools/testing/selftests/bpf/benchs/bench_strncmp.c
new file mode 100644
index 000000000000..494b591c0289
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_strncmp.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include <argp.h>
+#include "bench.h"
+#include "strncmp_bench.skel.h"
+
+static struct strncmp_ctx {
+	struct strncmp_bench *skel;
+} ctx;
+
+static struct strncmp_args {
+	u32 cmp_str_len;
+} args = {
+	.cmp_str_len = 32,
+};
+
+enum {
+	ARG_CMP_STR_LEN = 5000,
+};
+
+static const struct argp_option opts[] = {
+	{ "cmp-str-len", ARG_CMP_STR_LEN, "CMP_STR_LEN", 0,
+	  "Set the length of compared string" },
+	{},
+};
+
+static error_t strncmp_parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_CMP_STR_LEN:
+		args.cmp_str_len = strtoul(arg, NULL, 10);
+		if (!args.cmp_str_len ||
+		    args.cmp_str_len >= sizeof(ctx.skel->bss->str)) {
+			fprintf(stderr, "Invalid cmp str len (limit %zu)\n",
+				sizeof(ctx.skel->bss->str));
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_strncmp_argp = {
+	.options = opts,
+	.parser = strncmp_parse_arg,
+};
+
+static void strncmp_validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "strncmp benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+}
+
+static void strncmp_setup(void)
+{
+	int err;
+	char *target;
+	size_t i, sz;
+
+	sz = sizeof(ctx.skel->rodata->target);
+	if (!sz || sz < sizeof(ctx.skel->bss->str)) {
+		fprintf(stderr, "invalid string size (target %zu, src %zu)\n",
+			sz, sizeof(ctx.skel->bss->str));
+		exit(1);
+	}
+
+	setup_libbpf();
+
+	ctx.skel = strncmp_bench__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	srandom(time(NULL));
+	target = ctx.skel->rodata->target;
+	for (i = 0; i < sz - 1; i++)
+		target[i] = '1' + random() % 9;
+	target[sz - 1] = '\0';
+
+	ctx.skel->rodata->cmp_str_len = args.cmp_str_len;
+
+	memcpy(ctx.skel->bss->str, target, args.cmp_str_len);
+	ctx.skel->bss->str[args.cmp_str_len] = '\0';
+	/* Make bss->str < rodata->target */
+	ctx.skel->bss->str[args.cmp_str_len - 1] -= 1;
+
+	err = strncmp_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		strncmp_bench__destroy(ctx.skel);
+		exit(1);
+	}
+}
+
+static void strncmp_attach_prog(struct bpf_program *prog)
+{
+	struct bpf_link *link;
+
+	link = bpf_program__attach(prog);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+}
+
+static void strncmp_no_helper_setup(void)
+{
+	strncmp_setup();
+	strncmp_attach_prog(ctx.skel->progs.strncmp_no_helper);
+}
+
+static void strncmp_helper_setup(void)
+{
+	strncmp_setup();
+	strncmp_attach_prog(ctx.skel->progs.strncmp_helper);
+}
+
+static void *strncmp_producer(void *ctx)
+{
+	while (true)
+		(void)syscall(__NR_getpgid);
+	return NULL;
+}
+
+static void *strncmp_consumer(void *ctx)
+{
+	return NULL;
+}
+
+static void strncmp_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+const struct bench bench_strncmp_no_helper = {
+	.name = "strncmp-no-helper",
+	.validate = strncmp_validate,
+	.setup = strncmp_no_helper_setup,
+	.producer_thread = strncmp_producer,
+	.consumer_thread = strncmp_consumer,
+	.measure = strncmp_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_strncmp_helper = {
+	.name = "strncmp-helper",
+	.validate = strncmp_validate,
+	.setup = strncmp_helper_setup,
+	.producer_thread = strncmp_producer,
+	.consumer_thread = strncmp_consumer,
+	.measure = strncmp_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_strncmp.sh b/tools/testing/selftests/bpf/benchs/run_bench_strncmp.sh
new file mode 100755
index 000000000000..142697284b45
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_strncmp.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+for s in 1 8 64 512 2048 4095; do
+	for b in no-helper helper; do
+		summarize ${b}-${s} "$($RUN_BENCH --cmp-str-len=$s strncmp-${b})"
+	done
+done
diff --git a/tools/testing/selftests/bpf/progs/strncmp_bench.c b/tools/testing/selftests/bpf/progs/strncmp_bench.c
new file mode 100644
index 000000000000..18373a7df76e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strncmp_bench.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define STRNCMP_STR_SZ 4096
+
+/* Will be updated by benchmark before program loading */
+const volatile unsigned int cmp_str_len = 1;
+const char target[STRNCMP_STR_SZ];
+
+long hits = 0;
+char str[STRNCMP_STR_SZ];
+
+char _license[] SEC("license") = "GPL";
+
+static __always_inline int local_strncmp(const char *s1, unsigned int sz,
+					 const char *s2)
+{
+	int ret = 0;
+	unsigned int i;
+
+	for (i = 0; i < sz; i++) {
+		/* E.g. 0xff > 0x31 */
+		ret = (unsigned char)s1[i] - (unsigned char)s2[i];
+		if (ret || !s1[i])
+			break;
+	}
+
+	return ret;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strncmp_no_helper(void *ctx)
+{
+	if (local_strncmp(str, cmp_str_len + 1, target) < 0)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int strncmp_helper(void *ctx)
+{
+	if (bpf_strncmp(str, cmp_str_len + 1, target) < 0)
+		__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
-- 
cgit v1.2.3


From bdbee82beca4514496c52a2dc035f2a26f0c1b88 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 10 Dec 2021 22:16:52 +0800
Subject: selftests/bpf: Add test cases for bpf_strncmp()

Four test cases are added:
(1) ensure the return value is expected
(2) ensure no const string size is rejected
(3) ensure writable target is rejected
(4) ensure no null-terminated target is rejected

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211210141652.877186-5-houtao1@huawei.com
---
 .../selftests/bpf/prog_tests/test_strncmp.c        | 167 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/strncmp_test.c   |  54 +++++++
 2 files changed, 221 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_strncmp.c
 create mode 100644 tools/testing/selftests/bpf/progs/strncmp_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_strncmp.c b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c
new file mode 100644
index 000000000000..b57a3009465f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include <test_progs.h>
+#include "strncmp_test.skel.h"
+
+static int trigger_strncmp(const struct strncmp_test *skel)
+{
+	int cmp;
+
+	usleep(1);
+
+	cmp = skel->bss->cmp_ret;
+	if (cmp > 0)
+		return 1;
+	if (cmp < 0)
+		return -1;
+	return 0;
+}
+
+/*
+ * Compare str and target after making str[i] != target[i].
+ * When exp is -1, make str[i] < target[i] and delta = -1.
+ */
+static void strncmp_full_str_cmp(struct strncmp_test *skel, const char *name,
+				 int exp)
+{
+	size_t nr = sizeof(skel->bss->str);
+	char *str = skel->bss->str;
+	int delta = exp;
+	int got;
+	size_t i;
+
+	memcpy(str, skel->rodata->target, nr);
+	for (i = 0; i < nr - 1; i++) {
+		str[i] += delta;
+
+		got = trigger_strncmp(skel);
+		ASSERT_EQ(got, exp, name);
+
+		str[i] -= delta;
+	}
+}
+
+static void test_strncmp_ret(void)
+{
+	struct strncmp_test *skel;
+	struct bpf_program *prog;
+	int err, got;
+
+	skel = strncmp_test__open();
+	if (!ASSERT_OK_PTR(skel, "strncmp_test open"))
+		return;
+
+	bpf_object__for_each_program(prog, skel->obj)
+		bpf_program__set_autoload(prog, false);
+
+	bpf_program__set_autoload(skel->progs.do_strncmp, true);
+
+	err = strncmp_test__load(skel);
+	if (!ASSERT_EQ(err, 0, "strncmp_test load"))
+		goto out;
+
+	err = strncmp_test__attach(skel);
+	if (!ASSERT_EQ(err, 0, "strncmp_test attach"))
+		goto out;
+
+	skel->bss->target_pid = getpid();
+
+	/* Empty str */
+	skel->bss->str[0] = '\0';
+	got = trigger_strncmp(skel);
+	ASSERT_EQ(got, -1, "strncmp: empty str");
+
+	/* Same string */
+	memcpy(skel->bss->str, skel->rodata->target, sizeof(skel->bss->str));
+	got = trigger_strncmp(skel);
+	ASSERT_EQ(got, 0, "strncmp: same str");
+
+	/* Not-null-termainted string  */
+	memcpy(skel->bss->str, skel->rodata->target, sizeof(skel->bss->str));
+	skel->bss->str[sizeof(skel->bss->str) - 1] = 'A';
+	got = trigger_strncmp(skel);
+	ASSERT_EQ(got, 1, "strncmp: not-null-term str");
+
+	strncmp_full_str_cmp(skel, "strncmp: less than", -1);
+	strncmp_full_str_cmp(skel, "strncmp: greater than", 1);
+out:
+	strncmp_test__destroy(skel);
+}
+
+static void test_strncmp_bad_not_const_str_size(void)
+{
+	struct strncmp_test *skel;
+	struct bpf_program *prog;
+	int err;
+
+	skel = strncmp_test__open();
+	if (!ASSERT_OK_PTR(skel, "strncmp_test open"))
+		return;
+
+	bpf_object__for_each_program(prog, skel->obj)
+		bpf_program__set_autoload(prog, false);
+
+	bpf_program__set_autoload(skel->progs.strncmp_bad_not_const_str_size,
+				  true);
+
+	err = strncmp_test__load(skel);
+	ASSERT_ERR(err, "strncmp_test load bad_not_const_str_size");
+
+	strncmp_test__destroy(skel);
+}
+
+static void test_strncmp_bad_writable_target(void)
+{
+	struct strncmp_test *skel;
+	struct bpf_program *prog;
+	int err;
+
+	skel = strncmp_test__open();
+	if (!ASSERT_OK_PTR(skel, "strncmp_test open"))
+		return;
+
+	bpf_object__for_each_program(prog, skel->obj)
+		bpf_program__set_autoload(prog, false);
+
+	bpf_program__set_autoload(skel->progs.strncmp_bad_writable_target,
+				  true);
+
+	err = strncmp_test__load(skel);
+	ASSERT_ERR(err, "strncmp_test load bad_writable_target");
+
+	strncmp_test__destroy(skel);
+}
+
+static void test_strncmp_bad_not_null_term_target(void)
+{
+	struct strncmp_test *skel;
+	struct bpf_program *prog;
+	int err;
+
+	skel = strncmp_test__open();
+	if (!ASSERT_OK_PTR(skel, "strncmp_test open"))
+		return;
+
+	bpf_object__for_each_program(prog, skel->obj)
+		bpf_program__set_autoload(prog, false);
+
+	bpf_program__set_autoload(skel->progs.strncmp_bad_not_null_term_target,
+				  true);
+
+	err = strncmp_test__load(skel);
+	ASSERT_ERR(err, "strncmp_test load bad_not_null_term_target");
+
+	strncmp_test__destroy(skel);
+}
+
+void test_test_strncmp(void)
+{
+	if (test__start_subtest("strncmp_ret"))
+		test_strncmp_ret();
+	if (test__start_subtest("strncmp_bad_not_const_str_size"))
+		test_strncmp_bad_not_const_str_size();
+	if (test__start_subtest("strncmp_bad_writable_target"))
+		test_strncmp_bad_writable_target();
+	if (test__start_subtest("strncmp_bad_not_null_term_target"))
+		test_strncmp_bad_not_null_term_target();
+}
diff --git a/tools/testing/selftests/bpf/progs/strncmp_test.c b/tools/testing/selftests/bpf/progs/strncmp_test.c
new file mode 100644
index 000000000000..900d930d48a8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strncmp_test.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define STRNCMP_STR_SZ 8
+
+const char target[STRNCMP_STR_SZ] = "EEEEEEE";
+char str[STRNCMP_STR_SZ];
+int cmp_ret = 0;
+int target_pid = 0;
+
+const char no_str_target[STRNCMP_STR_SZ] = "12345678";
+char writable_target[STRNCMP_STR_SZ];
+unsigned int no_const_str_size = STRNCMP_STR_SZ;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int do_strncmp(void *ctx)
+{
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+		return 0;
+
+	cmp_ret = bpf_strncmp(str, STRNCMP_STR_SZ, target);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int strncmp_bad_not_const_str_size(void *ctx)
+{
+	/* The value of string size is not const, so will fail */
+	cmp_ret = bpf_strncmp(str, no_const_str_size, target);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int strncmp_bad_writable_target(void *ctx)
+{
+	/* Compared target is not read-only, so will fail */
+	cmp_ret = bpf_strncmp(str, STRNCMP_STR_SZ, writable_target);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int strncmp_bad_not_null_term_target(void *ctx)
+{
+	/* Compared target is not null-terminated, so will fail */
+	cmp_ret = bpf_strncmp(str, STRNCMP_STR_SZ, no_str_target);
+	return 0;
+}
-- 
cgit v1.2.3


From 5aaf9efffc57ea31a13af6f0bf41e96f073ed6d5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:54:08 +0000
Subject: kselftest: alsa: Add simplistic test for ALSA mixer controls
 kselftest

Add a basic test for the mixer control interface. For every control on
every sound card in the system it checks that it can read and write the
default value where the control supports that and for writeable controls
attempts to write all valid values, restoring the default values after
each test to minimise disruption for users.

There are quite a few areas for improvement - currently no coverage of the
generation of notifications, several of the control types don't have any
coverage for the values and we don't have any testing of error handling
when we attempt to write out of range values - but this provides some basic
coverage.

This is added as a kselftest since unlike other ALSA test programs it does
not require either physical setup of the device or interactive monitoring
by users and kselftest is one of the test suites that is frequently run by
people doing general automated testing so should increase coverage. It is
written in terms of alsa-lib since tinyalsa is not generally packaged for
distributions which makes things harder for general users interested in
kselftest as a whole but it will be a barrier to people with Android.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20211210185410.740009-2-broonie@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 MAINTAINERS                               |   8 +
 tools/testing/selftests/Makefile          |   3 +-
 tools/testing/selftests/alsa/.gitignore   |   1 +
 tools/testing/selftests/alsa/Makefile     |   9 +
 tools/testing/selftests/alsa/mixer-test.c | 605 ++++++++++++++++++++++++++++++
 5 files changed, 625 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/alsa/.gitignore
 create mode 100644 tools/testing/selftests/alsa/Makefile
 create mode 100644 tools/testing/selftests/alsa/mixer-test.c

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7a2345ce8521..8ded9f48b432 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17805,6 +17805,7 @@ F:	Documentation/sound/
 F:	include/sound/
 F:	include/uapi/sound/
 F:	sound/
+F:	tools/testing/selftests/alsa
 
 SOUND - COMPRESSED AUDIO
 M:	Vinod Koul <vkoul@kernel.org>
@@ -17824,6 +17825,13 @@ F:	include/sound/dmaengine_pcm.h
 F:	sound/core/pcm_dmaengine.c
 F:	sound/soc/soc-generic-dmaengine-pcm.c
 
+SOUND - ALSA SELFTESTS
+M:	Mark Brown <broonie@kernel.org>
+L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
+L:	linux-kselftest@vger.kernel.org
+S:	Supported
+F:	tools/testing/selftests/alsa
+
 SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC)
 M:	Liam Girdwood <lgirdwood@gmail.com>
 M:	Mark Brown <broonie@kernel.org>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index c852eb40c4f7..d08fe4cfe811 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-TARGETS = arm64
+TARGETS += alsa
+TARGETS += arm64
 TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
diff --git a/tools/testing/selftests/alsa/.gitignore b/tools/testing/selftests/alsa/.gitignore
new file mode 100644
index 000000000000..3bb7c41266a8
--- /dev/null
+++ b/tools/testing/selftests/alsa/.gitignore
@@ -0,0 +1 @@
+mixer-test
diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile
new file mode 100644
index 000000000000..f64d9090426d
--- /dev/null
+++ b/tools/testing/selftests/alsa/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+
+CFLAGS += $(shell pkg-config --cflags alsa)
+LDLIBS += $(shell pkg-config --libs alsa)
+
+TEST_GEN_PROGS := mixer-test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c
new file mode 100644
index 000000000000..ab51cf7b9e03
--- /dev/null
+++ b/tools/testing/selftests/alsa/mixer-test.c
@@ -0,0 +1,605 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// kselftest for the ALSA mixer API
+//
+// Original author: Mark Brown <broonie@kernel.org>
+// Copyright (c) 2021 Arm Limited
+
+// This test will iterate over all cards detected in the system, exercising
+// every mixer control it can find.  This may conflict with other system
+// software if there is audio activity so is best run on a system with a
+// minimal active userspace.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <math.h>
+#include <errno.h>
+#include <assert.h>
+#include <alsa/asoundlib.h>
+#include <poll.h>
+#include <stdint.h>
+
+#include "../kselftest.h"
+
+#define TESTS_PER_CONTROL 3
+
+struct card_data {
+	snd_ctl_t *handle;
+	int card;
+	int num_ctls;
+	snd_ctl_elem_list_t *ctls;
+	struct card_data *next;
+};
+
+struct ctl_data {
+	const char *name;
+	snd_ctl_elem_id_t *id;
+	snd_ctl_elem_info_t *info;
+	snd_ctl_elem_value_t *def_val;
+	int elem;
+	struct card_data *card;
+	struct ctl_data *next;
+};
+
+int num_cards = 0;
+int num_controls = 0;
+struct card_data *card_list = NULL;
+struct ctl_data *ctl_list = NULL;
+
+void find_controls(void)
+{
+	char name[32];
+	int card, ctl, err;
+	struct card_data *card_data;
+	struct ctl_data *ctl_data;
+
+	card = -1;
+	if (snd_card_next(&card) < 0 || card < 0)
+		return;
+
+	while (card >= 0) {
+		sprintf(name, "hw:%d", card);
+
+		card_data = malloc(sizeof(*card_data));
+		if (!card_data)
+			ksft_exit_fail_msg("Out of memory\n");
+
+		err = snd_ctl_open(&card_data->handle, name, 0);
+		if (err < 0) {
+			ksft_print_msg("Failed to get hctl for card %d: %s\n",
+				       card, snd_strerror(err));
+			goto next_card;
+		}
+
+		/* Count controls */
+		snd_ctl_elem_list_malloc(&card_data->ctls);
+		snd_ctl_elem_list(card_data->handle, card_data->ctls);
+		card_data->num_ctls = snd_ctl_elem_list_get_count(card_data->ctls);
+
+		/* Enumerate control information */
+		snd_ctl_elem_list_alloc_space(card_data->ctls, card_data->num_ctls);
+		snd_ctl_elem_list(card_data->handle, card_data->ctls);
+
+		card_data->card = num_cards++;
+		card_data->next = card_list;
+		card_list = card_data;
+
+		num_controls += card_data->num_ctls;
+
+		for (ctl = 0; ctl < card_data->num_ctls; ctl++) {
+			ctl_data = malloc(sizeof(*ctl_data));
+			if (!ctl_data)
+				ksft_exit_fail_msg("Out of memory\n");
+
+			ctl_data->card = card_data;
+			ctl_data->elem = ctl;
+			ctl_data->name = snd_ctl_elem_list_get_name(card_data->ctls,
+								    ctl);
+
+			err = snd_ctl_elem_id_malloc(&ctl_data->id);
+			if (err < 0)
+				ksft_exit_fail_msg("Out of memory\n");
+
+			err = snd_ctl_elem_info_malloc(&ctl_data->info);
+			if (err < 0)
+				ksft_exit_fail_msg("Out of memory\n");
+
+			err = snd_ctl_elem_value_malloc(&ctl_data->def_val);
+			if (err < 0)
+				ksft_exit_fail_msg("Out of memory\n");
+
+			snd_ctl_elem_list_get_id(card_data->ctls, ctl,
+						 ctl_data->id);
+			snd_ctl_elem_info_set_id(ctl_data->info, ctl_data->id);
+			err = snd_ctl_elem_info(card_data->handle,
+						ctl_data->info);
+			if (err < 0) {
+				ksft_print_msg("%s getting info for %d\n",
+					       snd_strerror(err),
+					       ctl_data->name);
+			}
+
+			snd_ctl_elem_value_set_id(ctl_data->def_val,
+						  ctl_data->id);
+
+			ctl_data->next = ctl_list;
+			ctl_list = ctl_data;
+		}
+
+	next_card:
+		if (snd_card_next(&card) < 0) {
+			ksft_print_msg("snd_card_next");
+			break;
+		}
+	}
+}
+
+/*
+ * Check that we can read the default value and it is valid. Write
+ * tests use the read value to restore the default.
+ */
+void test_ctl_get_value(struct ctl_data *ctl)
+{
+	int err;
+	long int_val;
+	long long int64_val;
+
+	/* If the control is turned off let's be polite */
+	if (snd_ctl_elem_info_is_inactive(ctl->info)) {
+		ksft_print_msg("%s is inactive\n", ctl->name);
+		ksft_test_result_skip("get_value.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	/* Can't test reading on an unreadable control */
+	if (!snd_ctl_elem_info_is_readable(ctl->info)) {
+		ksft_print_msg("%s is not readable\n", ctl->name);
+		ksft_test_result_skip("get_value.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	err = snd_ctl_elem_read(ctl->card->handle, ctl->def_val);
+	if (err < 0) {
+		ksft_print_msg("snd_ctl_elem_read() failed: %s\n",
+			       snd_strerror(err));
+		goto out;
+	}
+
+	switch (snd_ctl_elem_info_get_type(ctl->info)) {
+	case SND_CTL_ELEM_TYPE_NONE:
+		ksft_print_msg("%s Invalid control type NONE\n", ctl->name);
+		err = -1;
+		break;
+
+	case SND_CTL_ELEM_TYPE_BOOLEAN:
+		int_val = snd_ctl_elem_value_get_boolean(ctl->def_val, 0);
+		switch (int_val) {
+		case 0:
+		case 1:
+			break;
+		default:
+			ksft_print_msg("%s Invalid boolean value %ld\n",
+				       ctl->name, int_val);
+			err = -1;
+			break;
+		}
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER:
+		int_val = snd_ctl_elem_value_get_integer(ctl->def_val, 0);
+
+		if (int_val < snd_ctl_elem_info_get_min(ctl->info)) {
+			ksft_print_msg("%s value %ld less than minimum %ld\n",
+				       ctl->name, int_val,
+				       snd_ctl_elem_info_get_min(ctl->info));
+			err = -1;
+		}
+
+		if (int_val > snd_ctl_elem_info_get_max(ctl->info)) {
+			ksft_print_msg("%s value %ld more than maximum %ld\n",
+				       ctl->name, int_val,
+				       snd_ctl_elem_info_get_max(ctl->info));
+			err = -1;
+		}
+
+		/* Only check step size if there is one and we're in bounds */
+		if (err >= 0 && snd_ctl_elem_info_get_step(ctl->info) &&
+		    (int_val - snd_ctl_elem_info_get_min(ctl->info) %
+		     snd_ctl_elem_info_get_step(ctl->info))) {
+			ksft_print_msg("%s value %ld invalid for step %ld minimum %ld\n",
+				       ctl->name, int_val,
+				       snd_ctl_elem_info_get_step(ctl->info),
+				       snd_ctl_elem_info_get_min(ctl->info));
+			err = -1;
+		}
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER64:
+		int64_val = snd_ctl_elem_value_get_integer64(ctl->def_val, 0);
+
+		if (int64_val < snd_ctl_elem_info_get_min64(ctl->info)) {
+			ksft_print_msg("%s value %lld less than minimum %lld\n",
+				       ctl->name, int64_val,
+				       snd_ctl_elem_info_get_min64(ctl->info));
+			err = -1;
+		}
+
+		if (int64_val > snd_ctl_elem_info_get_max64(ctl->info)) {
+			ksft_print_msg("%s value %lld more than maximum %lld\n",
+				       ctl->name, int64_val,
+				       snd_ctl_elem_info_get_max(ctl->info));
+			err = -1;
+		}
+
+		/* Only check step size if there is one and we're in bounds */
+		if (err >= 0 && snd_ctl_elem_info_get_step64(ctl->info) &&
+		    (int64_val - snd_ctl_elem_info_get_min64(ctl->info)) %
+		    snd_ctl_elem_info_get_step64(ctl->info)) {
+			ksft_print_msg("%s value %lld invalid for step %lld minimum %lld\n",
+				       ctl->name, int64_val,
+				       snd_ctl_elem_info_get_step64(ctl->info),
+				       snd_ctl_elem_info_get_min64(ctl->info));
+			err = -1;
+		}
+		break;
+
+	default:
+		/* No tests for other types */
+		ksft_test_result_skip("get_value.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+out:
+	ksft_test_result(err >= 0, "get_value.%d.%d\n",
+			 ctl->card->card, ctl->elem);
+}
+
+bool show_mismatch(struct ctl_data *ctl, int index,
+		   snd_ctl_elem_value_t *read_val,
+		   snd_ctl_elem_value_t *expected_val)
+{
+	long long expected_int, read_int;
+
+	/*
+	 * We factor out the code to compare values representable as
+	 * integers, ensure that check doesn't log otherwise.
+	 */
+	expected_int = 0;
+	read_int = 0;
+
+	switch (snd_ctl_elem_info_get_type(ctl->info)) {
+	case SND_CTL_ELEM_TYPE_BOOLEAN:
+		expected_int = snd_ctl_elem_value_get_boolean(expected_val,
+							      index);
+		read_int = snd_ctl_elem_value_get_boolean(read_val, index);
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER:
+		expected_int = snd_ctl_elem_value_get_integer(expected_val,
+							      index);
+		read_int = snd_ctl_elem_value_get_integer(read_val, index);
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER64:
+		expected_int = snd_ctl_elem_value_get_integer64(expected_val,
+								index);
+		read_int = snd_ctl_elem_value_get_integer64(read_val,
+							    index);
+		break;
+
+	case SND_CTL_ELEM_TYPE_ENUMERATED:
+		expected_int = snd_ctl_elem_value_get_enumerated(expected_val,
+								 index);
+		read_int = snd_ctl_elem_value_get_enumerated(read_val,
+							     index);
+		break;
+
+	default:
+		break;
+	}
+
+	if (expected_int != read_int) {
+		ksft_print_msg("%s.%d expected %lld but read %lld\n",
+			       ctl->name, index, expected_int, read_int);
+		return true;
+	} else {
+		return false;
+	}
+}
+
+/*
+ * Write a value then if possible verify that we get the expected
+ * result.  An optional expected value can be provided if we expect
+ * the write to fail, for verifying that invalid writes don't corrupt
+ * anything.
+ */
+int write_and_verify(struct ctl_data *ctl,
+		     snd_ctl_elem_value_t *write_val,
+		     snd_ctl_elem_value_t *expected_val)
+{
+	int err, i;
+	bool error_expected, mismatch_shown;
+	snd_ctl_elem_value_t *read_val, *w_val;
+	snd_ctl_elem_value_alloca(&read_val);
+	snd_ctl_elem_value_alloca(&w_val);
+
+	/*
+	 * We need to copy the write value since writing can modify
+	 * the value which causes surprises, and allocate an expected
+	 * value if we expect to read back what we wrote.
+	 */
+	snd_ctl_elem_value_copy(w_val, write_val);
+	if (expected_val) {
+		error_expected = true;
+	} else {
+		error_expected = false;
+		snd_ctl_elem_value_alloca(&expected_val);
+		snd_ctl_elem_value_copy(expected_val, write_val);
+	}
+
+	/*
+	 * Do the write, if we have an expected value ignore the error
+	 * and carry on to validate the expected value.
+	 */
+	err = snd_ctl_elem_write(ctl->card->handle, w_val);
+	if (err < 0 && !error_expected) {
+		ksft_print_msg("snd_ctl_elem_write() failed: %s\n",
+			       snd_strerror(err));
+		return err;
+	}
+
+	/* Can we do the verification part? */
+	if (!snd_ctl_elem_info_is_readable(ctl->info))
+		return err;
+
+	snd_ctl_elem_value_set_id(read_val, ctl->id);
+
+	err = snd_ctl_elem_read(ctl->card->handle, read_val);
+	if (err < 0) {
+		ksft_print_msg("snd_ctl_elem_read() failed: %s\n",
+			       snd_strerror(err));
+		return err;
+	}
+
+	/*
+	 * Use the libray to compare values, if there's a mismatch
+	 * carry on and try to provide a more useful diagnostic than
+	 * just "mismatch".
+	 */
+	if (!snd_ctl_elem_value_compare(expected_val, read_val))
+		return 0;
+
+	mismatch_shown = false;
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++)
+		if (show_mismatch(ctl, i, read_val, expected_val))
+			mismatch_shown = true;
+
+	if (!mismatch_shown)
+		ksft_print_msg("%s read and written values differ\n",
+			       ctl->name);
+
+	return -1;
+}
+
+/*
+ * Make sure we can write the default value back to the control, this
+ * should validate that at least some write works.
+ */
+void test_ctl_write_default(struct ctl_data *ctl)
+{
+	int err;
+
+	/* If the control is turned off let's be polite */
+	if (snd_ctl_elem_info_is_inactive(ctl->info)) {
+		ksft_print_msg("%s is inactive\n", ctl->name);
+		ksft_test_result_skip("write_default.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	if (!snd_ctl_elem_info_is_writable(ctl->info)) {
+		ksft_print_msg("%s is not writeable\n", ctl->name);
+		ksft_test_result_skip("write_default.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	/* No idea what the default was for unreadable controls */
+	if (!snd_ctl_elem_info_is_readable(ctl->info)) {
+		ksft_print_msg("%s couldn't read default\n", ctl->name);
+		ksft_test_result_skip("write_default.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	err = write_and_verify(ctl, ctl->def_val, NULL);
+
+	ksft_test_result(err >= 0, "write_default.%d.%d\n",
+			 ctl->card->card, ctl->elem);
+}
+
+bool test_ctl_write_valid_boolean(struct ctl_data *ctl)
+{
+	int err, i, j;
+	bool fail = false;
+	snd_ctl_elem_value_t *val;
+	snd_ctl_elem_value_alloca(&val);
+
+	snd_ctl_elem_value_set_id(val, ctl->id);
+
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++) {
+		for (j = 0; j < 2; j++) {
+			snd_ctl_elem_value_set_boolean(val, i, j);
+			err = write_and_verify(ctl, val, NULL);
+			if (err != 0)
+				fail = true;
+		}
+	}
+
+	return !fail;
+}
+
+bool test_ctl_write_valid_integer(struct ctl_data *ctl)
+{
+	int err;
+	int i;
+	long j, step;
+	bool fail = false;
+	snd_ctl_elem_value_t *val;
+	snd_ctl_elem_value_alloca(&val);
+
+	snd_ctl_elem_value_set_id(val, ctl->id);
+
+	step = snd_ctl_elem_info_get_step(ctl->info);
+	if (!step)
+		step = 1;
+
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++) {
+		for (j = snd_ctl_elem_info_get_min(ctl->info);
+		     j <= snd_ctl_elem_info_get_max(ctl->info); j += step) {
+
+			snd_ctl_elem_value_set_integer(val, i, j);
+			err = write_and_verify(ctl, val, NULL);
+			if (err != 0)
+				fail = true;
+		}
+	}
+
+
+	return !fail;
+}
+
+bool test_ctl_write_valid_integer64(struct ctl_data *ctl)
+{
+	int err, i;
+	long long j, step;
+	bool fail = false;
+	snd_ctl_elem_value_t *val;
+	snd_ctl_elem_value_alloca(&val);
+
+	snd_ctl_elem_value_set_id(val, ctl->id);
+
+	step = snd_ctl_elem_info_get_step64(ctl->info);
+	if (!step)
+		step = 1;
+
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++) {
+		for (j = snd_ctl_elem_info_get_min64(ctl->info);
+		     j <= snd_ctl_elem_info_get_max64(ctl->info); j += step) {
+
+			snd_ctl_elem_value_set_integer64(val, i, j);
+			err = write_and_verify(ctl, val, NULL);
+			if (err != 0)
+				fail = true;
+		}
+	}
+
+	return !fail;
+}
+
+bool test_ctl_write_valid_enumerated(struct ctl_data *ctl)
+{
+	int err, i, j;
+	bool fail = false;
+	snd_ctl_elem_value_t *val;
+	snd_ctl_elem_value_alloca(&val);
+
+	snd_ctl_elem_value_set_id(val, ctl->id);
+
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++) {
+		for (j = 0; j < snd_ctl_elem_info_get_items(ctl->info); j++) {
+			snd_ctl_elem_value_set_enumerated(val, i, j);
+			err = write_and_verify(ctl, val, NULL);
+			if (err != 0)
+				fail = true;
+		}
+	}
+
+	return !fail;
+}
+
+void test_ctl_write_valid(struct ctl_data *ctl)
+{
+	bool pass;
+	int err;
+
+	/* If the control is turned off let's be polite */
+	if (snd_ctl_elem_info_is_inactive(ctl->info)) {
+		ksft_print_msg("%s is inactive\n", ctl->name);
+		ksft_test_result_skip("write_valid.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	if (!snd_ctl_elem_info_is_writable(ctl->info)) {
+		ksft_print_msg("%s is not writeable\n", ctl->name);
+		ksft_test_result_skip("write_valid.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	switch (snd_ctl_elem_info_get_type(ctl->info)) {
+	case SND_CTL_ELEM_TYPE_BOOLEAN:
+		pass = test_ctl_write_valid_boolean(ctl);
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER:
+		pass = test_ctl_write_valid_integer(ctl);
+		break;
+
+	case SND_CTL_ELEM_TYPE_INTEGER64:
+		pass = test_ctl_write_valid_integer64(ctl);
+		break;
+
+	case SND_CTL_ELEM_TYPE_ENUMERATED:
+		pass = test_ctl_write_valid_enumerated(ctl);
+		break;
+
+	default:
+		/* No tests for this yet */
+		ksft_test_result_skip("write_valid.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	/* Restore the default value to minimise disruption */
+	err = write_and_verify(ctl, ctl->def_val, NULL);
+	if (err < 0)
+		pass = false;
+
+	ksft_test_result(pass, "write_valid.%d.%d\n",
+			 ctl->card->card, ctl->elem);
+}
+
+int main(void)
+{
+	struct ctl_data *ctl;
+
+	ksft_print_header();
+
+	find_controls();
+
+	ksft_set_plan(num_controls * TESTS_PER_CONTROL);
+
+	for (ctl = ctl_list; ctl != NULL; ctl = ctl->next) {
+		/*
+		 * Must test get_value() before we write anything, the
+		 * test stores the default value for later cleanup.
+		 */
+		test_ctl_get_value(ctl);
+		test_ctl_write_default(ctl);
+		test_ctl_write_valid(ctl);
+	}
+
+	ksft_exit_pass();
+
+	return 0;
+}
-- 
cgit v1.2.3


From 7cc994f27e84cc94ce612d201c78763f93eab2c4 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 10 Dec 2021 18:54:09 +0000
Subject: kselftest: alsa: optimization for SNDRV_CTL_ELEM_ACCESS_VOLATILE

The volatile attribute of control element means that the hardware can
voluntarily change the state of control element independent of any
operation by software. ALSA control core necessarily sends notification
to userspace subscribers for any change from userspace application, while
it doesn't for the hardware's voluntary change.

This commit adds optimization for the attribute. Even if read value is
different from written value, the test reports success as long as the
target control element has the attribute. On the other hand, the
difference is itself reported for developers' convenience.

Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Link: https://lore.kernel.org/r/Ya7TAHdMe9i41bsC@workstation
[Fix comment style as suggested by Shuah -- broonie]
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210185410.740009-3-broonie@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/mixer-test.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c
index ab51cf7b9e03..f65a9046e708 100644
--- a/tools/testing/selftests/alsa/mixer-test.c
+++ b/tools/testing/selftests/alsa/mixer-test.c
@@ -307,9 +307,15 @@ bool show_mismatch(struct ctl_data *ctl, int index,
 	}
 
 	if (expected_int != read_int) {
-		ksft_print_msg("%s.%d expected %lld but read %lld\n",
-			       ctl->name, index, expected_int, read_int);
-		return true;
+		/*
+		 * NOTE: The volatile attribute means that the hardware
+		 * can voluntarily change the state of control element
+		 * independent of any operation by software.  
+		 */
+		bool is_volatile = snd_ctl_elem_info_is_volatile(ctl->info);
+		ksft_print_msg("%s.%d expected %lld but read %lld, is_volatile %d\n",
+			       ctl->name, index, expected_int, read_int, is_volatile);
+		return !is_volatile;
 	} else {
 		return false;
 	}
-- 
cgit v1.2.3


From b73dad806533cad55df41a9c0349969b56d4ff7f Mon Sep 17 00:00:00 2001
From: Jaroslav Kysela <perex@perex.cz>
Date: Fri, 10 Dec 2021 18:54:10 +0000
Subject: kselftest: alsa: Use private alsa-lib configuration in mixer test

As mentined by Takashi Sakamoto, the system-wide alsa-lib configuration
may override the standard device declarations. This patch use the private
alsa-lib configuration to set the predictable environment.

Signed-off-by: Jaroslav Kysela <perex@perex.cz>
Link: https://lore.kernel.org/r/20211208095209.1772296-1-perex@perex.cz
[Restructure version test to keep the preprocessor happy -- broonie]
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210185410.740009-4-broonie@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/mixer-test.c | 56 ++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c
index f65a9046e708..b798a76f6825 100644
--- a/tools/testing/selftests/alsa/mixer-test.c
+++ b/tools/testing/selftests/alsa/mixer-test.c
@@ -46,22 +46,74 @@ struct ctl_data {
 	struct ctl_data *next;
 };
 
+static const char *alsa_config =
+"ctl.hw {\n"
+"	@args [ CARD ]\n"
+"	@args.CARD.type string\n"
+"	type hw\n"
+"	card $CARD\n"
+"}\n"
+;
+
 int num_cards = 0;
 int num_controls = 0;
 struct card_data *card_list = NULL;
 struct ctl_data *ctl_list = NULL;
 
+#ifdef SND_LIB_VER
+#if SND_LIB_VERSION >= SND_LIB_VER(1, 2, 6)
+#define LIB_HAS_LOAD_STRING
+#endif
+#endif
+
+#ifndef LIB_HAS_LOAD_STRING
+int snd_config_load_string(snd_config_t **config, const char *s, size_t size)
+{
+	snd_input_t *input;
+	snd_config_t *dst;
+	int err;
+
+	assert(config && s);
+	if (size == 0)
+		size = strlen(s);
+	err = snd_input_buffer_open(&input, s, size);
+	if (err < 0)
+		return err;
+	err = snd_config_top(&dst);
+	if (err < 0) {
+		snd_input_close(input);
+		return err;
+	}
+	err = snd_config_load(dst, input);
+	snd_input_close(input);
+	if (err < 0) {
+		snd_config_delete(dst);
+		return err;
+	}
+	*config = dst;
+	return 0;
+}
+#endif
+
 void find_controls(void)
 {
 	char name[32];
 	int card, ctl, err;
 	struct card_data *card_data;
 	struct ctl_data *ctl_data;
+	snd_config_t *config;
 
 	card = -1;
 	if (snd_card_next(&card) < 0 || card < 0)
 		return;
 
+	err = snd_config_load_string(&config, alsa_config, strlen(alsa_config));
+	if (err < 0) {
+		ksft_print_msg("Unable to parse custom alsa-lib configuration: %s\n",
+			       snd_strerror(err));
+		ksft_exit_fail();
+	}
+
 	while (card >= 0) {
 		sprintf(name, "hw:%d", card);
 
@@ -69,7 +121,7 @@ void find_controls(void)
 		if (!card_data)
 			ksft_exit_fail_msg("Out of memory\n");
 
-		err = snd_ctl_open(&card_data->handle, name, 0);
+		err = snd_ctl_open_lconf(&card_data->handle, name, 0, config);
 		if (err < 0) {
 			ksft_print_msg("Failed to get hctl for card %d: %s\n",
 				       card, snd_strerror(err));
@@ -137,6 +189,8 @@ void find_controls(void)
 			break;
 		}
 	}
+
+	snd_config_delete(config);
 }
 
 /*
-- 
cgit v1.2.3


From 3748939bce3fc7a15ef07161826507fbe410bb7a Mon Sep 17 00:00:00 2001
From: Po-Hsu Lin <po-hsu.lin@canonical.com>
Date: Fri, 10 Dec 2021 15:25:23 +0800
Subject: selftests: icmp_redirect: pass xfail=0 to log_test()

If any sub-test in this icmp_redirect.sh is failing but not expected
to fail. The script will complain:
    ./icmp_redirect.sh: line 72: [: 1: unary operator expected

This is because when the sub-test is not expected to fail, we won't
pass any value for the xfail local variable in log_test() and thus
it's empty. Fix this by passing 0 as the 4th variable to log_test()
for non-xfail cases.

v2: added fixes tag

Fixes: 0a36a75c6818 ("selftests: icmp_redirect: support expected failures")
Signed-off-by: Po-Hsu Lin <po-hsu.lin@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/icmp_redirect.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh
index ecbf57f264ed..7b9d6e31b8e7 100755
--- a/tools/testing/selftests/net/icmp_redirect.sh
+++ b/tools/testing/selftests/net/icmp_redirect.sh
@@ -311,7 +311,7 @@ check_exception()
 		ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
 		grep -E -v 'mtu|redirected' | grep -q "cache"
 	fi
-	log_test $? 0 "IPv4: ${desc}"
+	log_test $? 0 "IPv4: ${desc}" 0
 
 	# No PMTU info for test "redirect" and "mtu exception plus redirect"
 	if [ "$with_redirect" = "yes" ] && [ "$desc" != "redirect exception plus mtu" ]; then
-- 
cgit v1.2.3


From 7e0147592b5c4f9e2eb8c54a7857a56d4863f74e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Sat, 11 Dec 2021 10:11:30 -0700
Subject: selftests: Add duplicate config only for MD5 VRF tests

Commit referenced below added configuration in the default VRF that
duplicates a VRF to check MD5 passwords are properly used and fail
when expected. That config should not be added all the time as it
can cause tests to pass that should not (by matching on default VRF
setup when it should not). Move the duplicate setup to a function
that is only called for the MD5 tests and add a cleanup function
to remove it after the MD5 tests.

Fixes: 5cad8bce26e0 ("fcnal-test: Add TCP MD5 tests for VRF")
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 29cc72d7c3d0..dd7437dd2680 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -455,6 +455,22 @@ cleanup()
 	ip netns del ${NSC} >/dev/null 2>&1
 }
 
+cleanup_vrf_dup()
+{
+	ip link del ${NSA_DEV2} >/dev/null 2>&1
+	ip netns pids ${NSC} | xargs kill 2>/dev/null
+	ip netns del ${NSC} >/dev/null 2>&1
+}
+
+setup_vrf_dup()
+{
+	# some VRF tests use ns-C which has the same config as
+	# ns-B but for a device NOT in the VRF
+	create_ns ${NSC} "-" "-"
+	connect_ns ${NSA} ${NSA_DEV2} ${NSA_IP}/24 ${NSA_IP6}/64 \
+		   ${NSC} ${NSC_DEV} ${NSB_IP}/24 ${NSB_IP6}/64
+}
+
 setup()
 {
 	local with_vrf=${1}
@@ -484,12 +500,6 @@ setup()
 
 		ip -netns ${NSB} ro add ${VRF_IP}/32 via ${NSA_IP} dev ${NSB_DEV}
 		ip -netns ${NSB} -6 ro add ${VRF_IP6}/128 via ${NSA_IP6} dev ${NSB_DEV}
-
-		# some VRF tests use ns-C which has the same config as
-		# ns-B but for a device NOT in the VRF
-		create_ns ${NSC} "-" "-"
-		connect_ns ${NSA} ${NSA_DEV2} ${NSA_IP}/24 ${NSA_IP6}/64 \
-			   ${NSC} ${NSC_DEV} ${NSB_IP}/24 ${NSB_IP6}/64
 	else
 		ip -netns ${NSA} ro add ${NSB_LO_IP}/32 via ${NSB_IP} dev ${NSA_DEV}
 		ip -netns ${NSA} ro add ${NSB_LO_IP6}/128 via ${NSB_IP6} dev ${NSA_DEV}
@@ -1240,7 +1250,9 @@ ipv4_tcp_vrf()
 	log_test_addr ${a} $? 1 "Global server, local connection"
 
 	# run MD5 tests
+	setup_vrf_dup
 	ipv4_tcp_md5
+	cleanup_vrf_dup
 
 	#
 	# enable VRF global server
@@ -2719,7 +2731,9 @@ ipv6_tcp_vrf()
 	log_test_addr ${a} $? 1 "Global server, local connection"
 
 	# run MD5 tests
+	setup_vrf_dup
 	ipv6_tcp_md5
+	cleanup_vrf_dup
 
 	#
 	# enable VRF global server
-- 
cgit v1.2.3


From 0f108ae4452025fef529671998f6c7f1c4526790 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Sat, 11 Dec 2021 10:21:08 -0700
Subject: selftests: Fix raw socket bind tests with VRF

Commit referenced below added negative socket bind tests for VRF. The
socket binds should fail since the address to bind to is in a VRF yet
the socket is not bound to the VRF or a device within it. Update the
expected return code to check for 1 (bind failure) so the test passes
when the bind fails as expected. Add a 'show_hint' comment to explain
why the bind is expected to fail.

Fixes: 75b2b2b3db4c ("selftests: Add ipv4 address bind tests to fcnal-test")
Reported-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index dd7437dd2680..4340477863d3 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -1810,8 +1810,9 @@ ipv4_addr_bind_vrf()
 	for a in ${NSA_IP} ${VRF_IP}
 	do
 		log_start
+		show_hint "Socket not bound to VRF, but address is in VRF"
 		run_cmd nettest -s -R -P icmp -l ${a} -b
-		log_test_addr ${a} $? 0 "Raw socket bind to local address"
+		log_test_addr ${a} $? 1 "Raw socket bind to local address"
 
 		log_start
 		run_cmd nettest -s -R -P icmp -l ${a} -I ${NSA_DEV} -b
-- 
cgit v1.2.3


From 28a2686c185e84b6aa6a4d9c9a972360eb7ca266 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Sat, 11 Dec 2021 11:26:16 -0700
Subject: selftests: Fix IPv6 address bind tests

IPv6 allows binding a socket to a device then binding to an address
not on the device (__inet6_bind -> ipv6_chk_addr with strict flag
not set). Update the bind tests to reflect legacy behavior.

Fixes: 34d0302ab861 ("selftests: Add ipv6 address bind tests to fcnal-test")
Reported-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 4340477863d3..ad2982b72e02 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -3429,11 +3429,14 @@ ipv6_addr_bind_novrf()
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
 	log_test_addr ${a} $? 0 "TCP socket bind to local address after device bind"
 
+	# Sadly, the kernel allows binding a socket to a device and then
+	# binding to an address not on the device. So this test passes
+	# when it really should not
 	a=${NSA_LO_IP6}
 	log_start
-	show_hint "Should fail with 'Cannot assign requested address'"
+	show_hint "Tecnically should fail since address is not on device but kernel allows"
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
-	log_test_addr ${a} $? 1 "TCP socket bind to out of scope local address"
+	log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address"
 }
 
 ipv6_addr_bind_vrf()
@@ -3474,10 +3477,15 @@ ipv6_addr_bind_vrf()
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
 	log_test_addr ${a} $? 0 "TCP socket bind to local address with device bind"
 
+	# Sadly, the kernel allows binding a socket to a device and then
+	# binding to an address not on the device. The only restriction
+	# is that the address is valid in the L3 domain. So this test
+	# passes when it really should not
 	a=${VRF_IP6}
 	log_start
+	show_hint "Tecnically should fail since address is not on device but kernel allows"
 	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
-	log_test_addr ${a} $? 1 "TCP socket bind to VRF address with device bind"
+	log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind"
 
 	a=${NSA_LO_IP6}
 	log_start
-- 
cgit v1.2.3


From f12468828c28ff90d20c99b234a94223401f7924 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 12 Dec 2021 11:13:41 -0800
Subject: selftests/bpf: Remove last bpf_create_map_xattr from test_verifier

bpf_create_map_xattr() call was reintroduced after merging bpf tree into
bpf-next tree. Convert the last instance into bpf_map_create() call.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211212191341.2529573-1-andrii@kernel.org
---
 tools/testing/selftests/bpf/test_verifier.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 16ce2ad097f4..ad5d30bafd93 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -701,22 +701,18 @@ static int create_sk_storage_map(void)
 
 static int create_map_timer(void)
 {
-	struct bpf_create_map_attr attr = {
-		.name = "test_map",
-		.map_type = BPF_MAP_TYPE_ARRAY,
-		.key_size = 4,
-		.value_size = 16,
-		.max_entries = 1,
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
 		.btf_key_type_id = 1,
 		.btf_value_type_id = 5,
-	};
+	);
 	int fd, btf_fd;
 
 	btf_fd = load_btf();
 	if (btf_fd < 0)
 		return -1;
-	attr.btf_fd = btf_fd;
-	fd = bpf_create_map_xattr(&attr);
+
+	opts.btf_fd = btf_fd;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "test_map", 4, 16, 1, &opts);
 	if (fd < 0)
 		printf("Failed to create map with timer\n");
 	return fd;
-- 
cgit v1.2.3


From 4b443bc1785f28df56fdbd6a107dc68ef7d5aa8e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 12 Dec 2021 17:07:06 -0800
Subject: libbpf: Don't validate TYPE_ID relo's original imm value

During linking, type IDs in the resulting linked BPF object file can
change, and so ldimm64 instructions corresponding to
BPF_CORE_TYPE_ID_TARGET and BPF_CORE_TYPE_ID_LOCAL CO-RE relos can get
their imm value out of sync with actual CO-RE relocation information
that's updated by BPF linker properly during linking process.

We could teach BPF linker to adjust such instructions, but it feels
a bit too much for linker to re-implement good chunk of
bpf_core_patch_insns logic just for this. This is a redundant safety
check for TYPE_ID relocations, as the real validation is in matching
CO-RE specs, so if that works fine, it's very unlikely that there is
something wrong with the instruction itself.

So, instead, teach libbpf (and kernel) to ignore insn->imm for
BPF_CORE_TYPE_ID_TARGET and BPF_CORE_TYPE_ID_LOCAL relos.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211213010706.100231-1-andrii@kernel.org
---
 tools/lib/bpf/relo_core.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index 32464f0ab4b1..c770483b4c36 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -709,10 +709,14 @@ static int bpf_core_calc_field_relo(const char *prog_name,
 
 static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo,
 				   const struct bpf_core_spec *spec,
-				   __u32 *val)
+				   __u32 *val, bool *validate)
 {
 	__s64 sz;
 
+	/* by default, always check expected value in bpf_insn */
+	if (validate)
+		*validate = true;
+
 	/* type-based relos return zero when target type is not found */
 	if (!spec) {
 		*val = 0;
@@ -722,6 +726,11 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo,
 	switch (relo->kind) {
 	case BPF_CORE_TYPE_ID_TARGET:
 		*val = spec->root_type_id;
+		/* type ID, embedded in bpf_insn, might change during linking,
+		 * so enforcing it is pointless
+		 */
+		if (validate)
+			*validate = false;
 		break;
 	case BPF_CORE_TYPE_EXISTS:
 		*val = 1;
@@ -861,8 +870,8 @@ static int bpf_core_calc_relo(const char *prog_name,
 			res->fail_memsz_adjust = true;
 		}
 	} else if (core_relo_is_type_based(relo->kind)) {
-		err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val);
-		err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val);
+		err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val, &res->validate);
+		err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val, NULL);
 	} else if (core_relo_is_enumval_based(relo->kind)) {
 		err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val);
 		err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val);
@@ -1213,7 +1222,8 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 
 	/* TYPE_ID_LOCAL relo is special and doesn't need candidate search */
 	if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) {
-		targ_res.validate = true;
+		/* bpf_insn's imm value could get out of sync during linking */
+		targ_res.validate = false;
 		targ_res.poison = false;
 		targ_res.orig_val = local_spec->root_type_id;
 		targ_res.new_val = local_spec->root_type_id;
@@ -1227,7 +1237,6 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 		return -EOPNOTSUPP;
 	}
 
-
 	for (i = 0, j = 0; i < cands->len; i++) {
 		err = bpf_core_spec_match(local_spec, cands->cands[i].btf,
 					  cands->cands[i].id, cand_spec);
-- 
cgit v1.2.3


From a8d13611b4a7b1b20d17bf2b9a89a3efcabde56c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 11 Dec 2021 14:30:31 -0500
Subject: selftests/net: toeplitz: fix udp option

Tiny fix. Option -u ("use udp") does not take an argument.

It can cause the next argument to silently be ignored.

Fixes: 5ebfb4cc3048 ("selftests/net: toeplitz test")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/toeplitz.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/toeplitz.c b/tools/testing/selftests/net/toeplitz.c
index 710ac956bdb3..c5489341cfb8 100644
--- a/tools/testing/selftests/net/toeplitz.c
+++ b/tools/testing/selftests/net/toeplitz.c
@@ -498,7 +498,7 @@ static void parse_opts(int argc, char **argv)
 	bool have_toeplitz = false;
 	int index, c;
 
-	while ((c = getopt_long(argc, argv, "46C:d:i:k:r:stT:u:v", long_options, &index)) != -1) {
+	while ((c = getopt_long(argc, argv, "46C:d:i:k:r:stT:uv", long_options, &index)) != -1) {
 		switch (c) {
 		case '4':
 			cfg_family = AF_INET;
-- 
cgit v1.2.3


From 87f7282e76bef97d775a7cd61dc1988fc5ed8639 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Sat, 11 Dec 2021 15:04:57 -0500
Subject: selftests/net: expand gro with two machine test

The test is currently run on a single host with private addresses,
either over veth or by setting a nic in loopback mode with macvlan.

Support running between two real devices. Allow overriding addresses.

Also cut timeout to fail faster on error and explicitly log success.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/gro.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index cf37ce86b0fd..ee0b03f3dd06 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -64,10 +64,6 @@
 #define NUM_PACKETS 4
 #define START_SEQ 100
 #define START_ACK 100
-#define SIP6 "fdaa::2"
-#define DIP6 "fdaa::1"
-#define SIP4 "192.168.1.200"
-#define DIP4 "192.168.1.100"
 #define ETH_P_NONE 0
 #define TOTAL_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
 #define MSS (4096 - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
@@ -75,6 +71,10 @@
 #define NUM_LARGE_PKT (MAX_PAYLOAD / MSS)
 #define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
 
+static const char *addr6_src = "fdaa::2";
+static const char *addr6_dst = "fdaa::1";
+static const char *addr4_src = "192.168.1.200";
+static const char *addr4_dst = "192.168.1.100";
 static int proto = -1;
 static uint8_t src_mac[ETH_ALEN], dst_mac[ETH_ALEN];
 static char *testname = "data";
@@ -178,18 +178,18 @@ static uint16_t tcp_checksum(void *buf, int payload_len)
 	uint32_t sum = 0;
 
 	if (proto == PF_INET6) {
-		if (inet_pton(AF_INET6, SIP6, &ph6.saddr) != 1)
+		if (inet_pton(AF_INET6, addr6_src, &ph6.saddr) != 1)
 			error(1, errno, "inet_pton6 source ip pseudo");
-		if (inet_pton(AF_INET6, DIP6, &ph6.daddr) != 1)
+		if (inet_pton(AF_INET6, addr6_dst, &ph6.daddr) != 1)
 			error(1, errno, "inet_pton6 dest ip pseudo");
 		ph6.protocol = htons(IPPROTO_TCP);
 		ph6.payload_len = htons(sizeof(struct tcphdr) + payload_len);
 
 		sum = checksum_nofold(&ph6, sizeof(ph6), 0);
 	} else if (proto == PF_INET) {
-		if (inet_pton(AF_INET, SIP4, &ph4.saddr) != 1)
+		if (inet_pton(AF_INET, addr4_src, &ph4.saddr) != 1)
 			error(1, errno, "inet_pton source ip pseudo");
-		if (inet_pton(AF_INET, DIP4, &ph4.daddr) != 1)
+		if (inet_pton(AF_INET, addr4_dst, &ph4.daddr) != 1)
 			error(1, errno, "inet_pton dest ip pseudo");
 		ph4.protocol = htons(IPPROTO_TCP);
 		ph4.payload_len = htons(sizeof(struct tcphdr) + payload_len);
@@ -229,9 +229,9 @@ static void fill_networklayer(void *buf, int payload_len)
 		ip6h->payload_len = htons(sizeof(struct tcphdr) + payload_len);
 		ip6h->nexthdr = IPPROTO_TCP;
 		ip6h->hop_limit = 8;
-		if (inet_pton(AF_INET6, SIP6, &ip6h->saddr) != 1)
+		if (inet_pton(AF_INET6, addr6_src, &ip6h->saddr) != 1)
 			error(1, errno, "inet_pton source ip6");
-		if (inet_pton(AF_INET6, DIP6, &ip6h->daddr) != 1)
+		if (inet_pton(AF_INET6, addr6_dst, &ip6h->daddr) != 1)
 			error(1, errno, "inet_pton dest ip6");
 	} else if (proto == PF_INET) {
 		memset(iph, 0, sizeof(*iph));
@@ -243,9 +243,9 @@ static void fill_networklayer(void *buf, int payload_len)
 		iph->tot_len = htons(sizeof(struct tcphdr) +
 				payload_len + sizeof(struct iphdr));
 		iph->frag_off = htons(0x4000); /* DF = 1, MF = 0 */
-		if (inet_pton(AF_INET, SIP4, &iph->saddr) != 1)
+		if (inet_pton(AF_INET, addr4_src, &iph->saddr) != 1)
 			error(1, errno, "inet_pton source ip");
-		if (inet_pton(AF_INET, DIP4, &iph->daddr) != 1)
+		if (inet_pton(AF_INET, addr4_dst, &iph->daddr) != 1)
 			error(1, errno, "inet_pton dest ip");
 		iph->check = checksum_fold(buf, sizeof(struct iphdr), 0);
 	}
@@ -731,7 +731,7 @@ static void set_timeout(int fd)
 {
 	struct timeval timeout;
 
-	timeout.tv_sec = 120;
+	timeout.tv_sec = 3;
 	timeout.tv_usec = 0;
 	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
 		       sizeof(timeout)) < 0)
@@ -1023,11 +1023,13 @@ static void gro_receiver(void)
 static void parse_args(int argc, char **argv)
 {
 	static const struct option opts[] = {
+		{ "daddr", required_argument, NULL, 'd' },
 		{ "dmac", required_argument, NULL, 'D' },
 		{ "iface", required_argument, NULL, 'i' },
 		{ "ipv4", no_argument, NULL, '4' },
 		{ "ipv6", no_argument, NULL, '6' },
 		{ "rx", no_argument, NULL, 'r' },
+		{ "saddr", required_argument, NULL, 's' },
 		{ "smac", required_argument, NULL, 'S' },
 		{ "test", required_argument, NULL, 't' },
 		{ "verbose", no_argument, NULL, 'v' },
@@ -1035,7 +1037,7 @@ static void parse_args(int argc, char **argv)
 	};
 	int c;
 
-	while ((c = getopt_long(argc, argv, "46D:i:rS:t:v", opts, NULL)) != -1) {
+	while ((c = getopt_long(argc, argv, "46d:D:i:rs:S:t:v", opts, NULL)) != -1) {
 		switch (c) {
 		case '4':
 			proto = PF_INET;
@@ -1045,6 +1047,9 @@ static void parse_args(int argc, char **argv)
 			proto = PF_INET6;
 			ethhdr_proto = htons(ETH_P_IPV6);
 			break;
+		case 'd':
+			addr4_dst = addr6_dst = optarg;
+			break;
 		case 'D':
 			dmac = optarg;
 			break;
@@ -1054,6 +1059,9 @@ static void parse_args(int argc, char **argv)
 		case 'r':
 			tx_socket = false;
 			break;
+		case 's':
+			addr4_src = addr6_src = optarg;
+			break;
 		case 'S':
 			smac = optarg;
 			break;
@@ -1091,5 +1099,7 @@ int main(int argc, char **argv)
 		gro_sender();
 	else
 		gro_receiver();
+
+	fprintf(stderr, "Gro::%s test passed.\n", testname);
 	return 0;
 }
-- 
cgit v1.2.3


From 71da1aec215290e249d09c44c768df859f3a3bba Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 13 Dec 2021 16:36:00 +0800
Subject: selftest/net/forwarding: declare NETIFS p9 p10

The recent GRE selftests defined NUM_NETIFS=10. If the users copy
forwarding.config.sample to forwarding.config directly, they will get
error "Command line is not complete" when run the GRE tests, because
create_netif_veth() failed with no interface name defined.

Fix it by extending the NETIFS with p9 and p10.

Fixes: 2800f2485417 ("selftests: forwarding: Test multipath hashing on inner IP pkts for GRE tunnel")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/forwarding.config.sample | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample
index bf17e485684f..b0980a2efa31 100644
--- a/tools/testing/selftests/net/forwarding/forwarding.config.sample
+++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample
@@ -13,6 +13,8 @@ NETIFS[p5]=veth4
 NETIFS[p6]=veth5
 NETIFS[p7]=veth6
 NETIFS[p8]=veth7
+NETIFS[p9]=veth8
+NETIFS[p10]=veth9
 
 # Port that does not have a cable connected.
 NETIF_NO_CABLE=eth8
-- 
cgit v1.2.3


From 2b070c2bc885977ca8fe76ba8f6b9d73d8d20e39 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 8 Dec 2021 20:32:42 +0100
Subject: selftests/bpf: Add test to access int ptr argument in tracing program

Adding verifier test for accessing int pointer argument in
tracing programs.

The test program loads 2nd argument of bpf_modify_return_test
function which is int pointer and checks that verifier allows
that.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211208193245.172141-3-jolsa@kernel.org
---
 tools/testing/selftests/bpf/verifier/btf_ctx_access.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/verifier/btf_ctx_access.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/btf_ctx_access.c b/tools/testing/selftests/bpf/verifier/btf_ctx_access.c
new file mode 100644
index 000000000000..6340db6b46dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/btf_ctx_access.c
@@ -0,0 +1,12 @@
+{
+	"btf_ctx_access accept",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 8),	/* load 2nd argument value (int pointer) */
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_FENTRY,
+	.kfunc = "bpf_modify_return_test",
+},
-- 
cgit v1.2.3


From f92c1e183604c20ce00eb889315fdaa8f2d9e509 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 8 Dec 2021 20:32:44 +0100
Subject: bpf: Add get_func_[arg|ret|arg_cnt] helpers

Adding following helpers for tracing programs:

Get n-th argument of the traced function:
  long bpf_get_func_arg(void *ctx, u32 n, u64 *value)

Get return value of the traced function:
  long bpf_get_func_ret(void *ctx, u64 *value)

Get arguments count of the traced function:
  long bpf_get_func_arg_cnt(void *ctx)

The trampoline now stores number of arguments on ctx-8
address, so it's easy to verify argument index and find
return value argument's position.

Moving function ip address on the trampoline stack behind
the number of functions arguments, so it's now stored on
ctx-16 address if it's needed.

All helpers above are inlined by verifier.

Also bit unrelated small change - using newly added function
bpf_prog_has_trampoline in check_get_func_ip.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211208193245.172141-5-jolsa@kernel.org
---
 arch/x86/net/bpf_jit_comp.c    | 15 +++++++-
 include/linux/bpf.h            |  5 +++
 include/uapi/linux/bpf.h       | 28 +++++++++++++++
 kernel/bpf/trampoline.c        |  8 +++++
 kernel/bpf/verifier.c          | 77 +++++++++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c       | 55 +++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h | 28 +++++++++++++++
 7 files changed, 209 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 10fab8cb3fb5..4bbcded07415 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1941,7 +1941,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				void *orig_call)
 {
 	int ret, i, nr_args = m->nr_args;
-	int regs_off, ip_off, stack_size = nr_args * 8;
+	int regs_off, ip_off, args_off, stack_size = nr_args * 8;
 	struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
 	struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
 	struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
@@ -1968,6 +1968,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	 *                 [ ...             ]
 	 * RBP - regs_off  [ reg_arg1        ]  program's ctx pointer
 	 *
+	 * RBP - args_off  [ args count      ]  always
+	 *
 	 * RBP - ip_off    [ traced function ]  BPF_TRAMP_F_IP_ARG flag
 	 */
 
@@ -1978,6 +1980,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 
 	regs_off = stack_size;
 
+	/* args count  */
+	stack_size += 8;
+	args_off = stack_size;
+
 	if (flags & BPF_TRAMP_F_IP_ARG)
 		stack_size += 8; /* room for IP address argument */
 
@@ -1996,6 +2002,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
 	EMIT1(0x53);		 /* push rbx */
 
+	/* Store number of arguments of the traced function:
+	 *   mov rax, nr_args
+	 *   mov QWORD PTR [rbp - args_off], rax
+	 */
+	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+
 	if (flags & BPF_TRAMP_F_IP_ARG) {
 		/* Store IP address of the traced function:
 		 * mov rax, QWORD PTR [rbp + 8]
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a40022e3d00..965fffaf0308 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -777,6 +777,7 @@ void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
 int bpf_jit_charge_modmem(u32 pages);
 void bpf_jit_uncharge_modmem(u32 pages);
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
 					   struct bpf_trampoline *tr)
@@ -805,6 +806,10 @@ static inline bool is_bpf_image_address(unsigned long address)
 {
 	return false;
 }
+static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+	return false;
+}
 #endif
 
 struct bpf_func_info_aux {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2820c77e4846..b0383d371b9a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4993,6 +4993,31 @@ union bpf_attr {
  *		An integer less than, equal to, or greater than zero
  *		if the first **s1_sz** bytes of **s1** is found to be
  *		less than, to match, or be greater than **s2**.
+ *
+ * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
+ *	Description
+ *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		returned in **value**.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if n >= arguments count of traced function.
+ *
+ * long bpf_get_func_ret(void *ctx, u64 *value)
+ *	Description
+ *		Get return value of the traced function (for tracing programs)
+ *		in **value**.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN.
+ *
+ * long bpf_get_func_arg_cnt(void *ctx)
+ *	Description
+ *		Get number of arguments of the traced function (for tracing programs).
+ *
+ *	Return
+ *		The number of arguments of the traced function.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5178,6 +5203,9 @@ union bpf_attr {
 	FN(find_vma),			\
 	FN(loop),			\
 	FN(strncmp),			\
+	FN(get_func_arg),		\
+	FN(get_func_ret),		\
+	FN(get_func_arg_cnt),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e98de5e73ba5..4b6974a195c1 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);
 
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+	enum bpf_attach_type eatype = prog->expected_attach_type;
+
+	return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+	       eatype == BPF_MODIFY_RETURN;
+}
+
 void *bpf_jit_alloc_exec_page(void)
 {
 	void *image;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b39de3ae50f5..d74e8a99412e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6395,13 +6395,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
 
 static int check_get_func_ip(struct bpf_verifier_env *env)
 {
-	enum bpf_attach_type eatype = env->prog->expected_attach_type;
 	enum bpf_prog_type type = resolve_prog_type(env->prog);
 	int func_id = BPF_FUNC_get_func_ip;
 
 	if (type == BPF_PROG_TYPE_TRACING) {
-		if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
-		    eatype != BPF_MODIFY_RETURN) {
+		if (!bpf_prog_has_trampoline(env->prog)) {
 			verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
 				func_id_name(func_id), func_id);
 			return -ENOTSUPP;
@@ -12997,6 +12995,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
 static int do_misc_fixups(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
+	enum bpf_attach_type eatype = prog->expected_attach_type;
 	bool expect_blinding = bpf_jit_blinding_enabled(prog);
 	enum bpf_prog_type prog_type = resolve_prog_type(prog);
 	struct bpf_insn *insn = prog->insnsi;
@@ -13367,11 +13366,79 @@ patch_map_ops_generic:
 			continue;
 		}
 
+		/* Implement bpf_get_func_arg inline. */
+		if (prog_type == BPF_PROG_TYPE_TRACING &&
+		    insn->imm == BPF_FUNC_get_func_arg) {
+			/* Load nr_args from ctx - 8 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+			insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+			insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+			insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+			insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+			insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+			insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+			insn_buf[7] = BPF_JMP_A(1);
+			insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+			cnt = 9;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
+		/* Implement bpf_get_func_ret inline. */
+		if (prog_type == BPF_PROG_TYPE_TRACING &&
+		    insn->imm == BPF_FUNC_get_func_ret) {
+			if (eatype == BPF_TRACE_FEXIT ||
+			    eatype == BPF_MODIFY_RETURN) {
+				/* Load nr_args from ctx - 8 */
+				insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+				insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+				insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+				insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+				insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+				insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+				cnt = 6;
+			} else {
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+				cnt = 1;
+			}
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
+		/* Implement get_func_arg_cnt inline. */
+		if (prog_type == BPF_PROG_TYPE_TRACING &&
+		    insn->imm == BPF_FUNC_get_func_arg_cnt) {
+			/* Load nr_args from ctx - 8 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+			if (!new_prog)
+				return -ENOMEM;
+
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		/* Implement bpf_get_func_ip inline. */
 		if (prog_type == BPF_PROG_TYPE_TRACING &&
 		    insn->imm == BPF_FUNC_get_func_ip) {
-			/* Load IP address from ctx - 8 */
-			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+			/* Load IP address from ctx - 16 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
 
 			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
 			if (!new_prog)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 623dd0684429..cea2ca6df949 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1012,7 +1012,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
 BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
 {
 	/* This helper call is inlined by verifier. */
-	return ((u64 *)ctx)[-1];
+	return ((u64 *)ctx)[-2];
 }
 
 static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
@@ -1091,6 +1091,53 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
+{
+	/* This helper call is inlined by verifier. */
+	u64 nr_args = ((u64 *)ctx)[-1];
+
+	if ((u64) n >= nr_args)
+		return -EINVAL;
+	*value = ((u64 *)ctx)[n];
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_proto = {
+	.func		= get_func_arg,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
+{
+	/* This helper call is inlined by verifier. */
+	u64 nr_args = ((u64 *)ctx)[-1];
+
+	*value = ((u64 *)ctx)[nr_args];
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ret_proto = {
+	.func		= get_func_ret,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(get_func_arg_cnt, void *, ctx)
+{
+	/* This helper call is inlined by verifier. */
+	return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
+	.func		= get_func_arg_cnt,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1629,6 +1676,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		       NULL;
 	case BPF_FUNC_d_path:
 		return &bpf_d_path_proto;
+	case BPF_FUNC_get_func_arg:
+		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+	case BPF_FUNC_get_func_ret:
+		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
+	case BPF_FUNC_get_func_arg_cnt:
+		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
 	default:
 		fn = raw_tp_prog_func_proto(func_id, prog);
 		if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2820c77e4846..b0383d371b9a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4993,6 +4993,31 @@ union bpf_attr {
  *		An integer less than, equal to, or greater than zero
  *		if the first **s1_sz** bytes of **s1** is found to be
  *		less than, to match, or be greater than **s2**.
+ *
+ * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
+ *	Description
+ *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		returned in **value**.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if n >= arguments count of traced function.
+ *
+ * long bpf_get_func_ret(void *ctx, u64 *value)
+ *	Description
+ *		Get return value of the traced function (for tracing programs)
+ *		in **value**.
+ *
+ *	Return
+ *		0 on success.
+ *		**-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN.
+ *
+ * long bpf_get_func_arg_cnt(void *ctx)
+ *	Description
+ *		Get number of arguments of the traced function (for tracing programs).
+ *
+ *	Return
+ *		The number of arguments of the traced function.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5178,6 +5203,9 @@ union bpf_attr {
 	FN(find_vma),			\
 	FN(loop),			\
 	FN(strncmp),			\
+	FN(get_func_arg),		\
+	FN(get_func_ret),		\
+	FN(get_func_arg_cnt),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 006004b715569f742535f70f3f06b41d8135486c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 8 Dec 2021 20:32:45 +0100
Subject: selftests/bpf: Add tests for get_func_[arg|ret|arg_cnt] helpers

Adding tests for get_func_[arg|ret|arg_cnt] helpers.
Using these helpers in fentry/fexit/fmod_ret programs.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211208193245.172141-6-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/get_func_args_test.c  |  44 ++++++++
 .../selftests/bpf/progs/get_func_args_test.c       | 123 +++++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
 create mode 100644 tools/testing/selftests/bpf/progs/get_func_args_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
new file mode 100644
index 000000000000..85c427119fe9
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "get_func_args_test.skel.h"
+
+void test_get_func_args_test(void)
+{
+	struct get_func_args_test *skel = NULL;
+	__u32 duration = 0, retval;
+	int err, prog_fd;
+
+	skel = get_func_args_test__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "get_func_args_test__open_and_load"))
+		return;
+
+	err = get_func_args_test__attach(skel);
+	if (!ASSERT_OK(err, "get_func_args_test__attach"))
+		goto cleanup;
+
+	/* This runs bpf_fentry_test* functions and triggers
+	 * fentry/fexit programs.
+	 */
+	prog_fd = bpf_program__fd(skel->progs.test1);
+	err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+				NULL, NULL, &retval, &duration);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(retval, 0, "test_run");
+
+	/* This runs bpf_modify_return_test function and triggers
+	 * fmod_ret_test and fexit_test programs.
+	 */
+	prog_fd = bpf_program__fd(skel->progs.fmod_ret_test);
+	err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+				NULL, NULL, &retval, &duration);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(retval, 1234, "test_run");
+
+	ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
+	ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
+	ASSERT_EQ(skel->bss->test3_result, 1, "test3_result");
+	ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
+
+cleanup:
+	get_func_args_test__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c
new file mode 100644
index 000000000000..e0f34a55e697
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u64 test1_result = 0;
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, z = 0, ret = 0;
+	__s64 err;
+
+	test1_result = cnt == 1;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+
+	/* We need to cast access to traced function argument values with
+	 * proper type cast, because trampoline uses type specific instruction
+	 * to save it, like for 'int a' with 32-bit mov like:
+	 *
+	 *   mov %edi,-0x8(%rbp)
+	 *
+	 * so the upper 4 bytes are not zeroed.
+	 */
+	test1_result &= err == 0 && ((int) a == 1);
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 1, &z);
+	test1_result &= err == -EINVAL;
+
+	/* return value fails in fentry */
+	err = bpf_get_func_ret(ctx, &ret);
+	test1_result &= err == -EOPNOTSUPP;
+	return 0;
+}
+
+__u64 test2_result = 0;
+SEC("fexit/bpf_fentry_test2")
+int BPF_PROG(test2)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, b = 0, z = 0, ret = 0;
+	__s64 err;
+
+	test2_result = cnt == 2;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test2_result &= err == 0 && (int) a == 2;
+
+	err = bpf_get_func_arg(ctx, 1, &b);
+	test2_result &= err == 0 && b == 3;
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 2, &z);
+	test2_result &= err == -EINVAL;
+
+	/* return value */
+	err = bpf_get_func_ret(ctx, &ret);
+	test2_result &= err == 0 && ret == 5;
+	return 0;
+}
+
+__u64 test3_result = 0;
+SEC("fmod_ret/bpf_modify_return_test")
+int BPF_PROG(fmod_ret_test, int _a, int *_b, int _ret)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, b = 0, z = 0, ret = 0;
+	__s64 err;
+
+	test3_result = cnt == 2;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test3_result &= err == 0 && ((int) a == 1);
+
+	err = bpf_get_func_arg(ctx, 1, &b);
+	test3_result &= err == 0 && ((int *) b == _b);
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 2, &z);
+	test3_result &= err == -EINVAL;
+
+	/* return value */
+	err = bpf_get_func_ret(ctx, &ret);
+	test3_result &= err == 0 && ret == 0;
+
+	/* change return value, it's checked in fexit_test program */
+	return 1234;
+}
+
+__u64 test4_result = 0;
+SEC("fexit/bpf_modify_return_test")
+int BPF_PROG(fexit_test, int _a, int *_b, int _ret)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, b = 0, z = 0, ret = 0;
+	__s64 err;
+
+	test4_result = cnt == 2;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test4_result &= err == 0 && ((int) a == 1);
+
+	err = bpf_get_func_arg(ctx, 1, &b);
+	test4_result &= err == 0 && ((int *) b == _b);
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 2, &z);
+	test4_result &= err == -EINVAL;
+
+	/* return value */
+	err = bpf_get_func_ret(ctx, &ret);
+	test4_result &= err == 0 && ret == 1234;
+	return 0;
+}
-- 
cgit v1.2.3


From 0e25498f8cd43c1b5aa327f373dd094e9a006da7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jun 2021 14:52:01 -0500
Subject: exit: Add and use make_task_dead.

There are two big uses of do_exit.  The first is it's design use to be
the guts of the exit(2) system call.  The second use is to terminate
a task after something catastrophic has happened like a NULL pointer
in kernel code.

Add a function make_task_dead that is initialy exactly the same as
do_exit to cover the cases where do_exit is called to handle
catastrophic failure.  In time this can probably be reduced to just a
light wrapper around do_task_dead. For now keep it exactly the same so
that there will be no behavioral differences introducing this new
concept.

Replace all of the uses of do_exit that use it for catastraphic
task cleanup with make_task_dead to make it clear what the code
is doing.

As part of this rename rewind_stack_do_exit
rewind_stack_and_make_dead.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/alpha/kernel/traps.c           | 6 +++---
 arch/alpha/mm/fault.c               | 2 +-
 arch/arm/kernel/traps.c             | 2 +-
 arch/arm/mm/fault.c                 | 2 +-
 arch/arm64/kernel/traps.c           | 2 +-
 arch/arm64/mm/fault.c               | 2 +-
 arch/csky/abiv1/alignment.c         | 2 +-
 arch/csky/kernel/traps.c            | 2 +-
 arch/csky/mm/fault.c                | 2 +-
 arch/h8300/kernel/traps.c           | 2 +-
 arch/h8300/mm/fault.c               | 2 +-
 arch/hexagon/kernel/traps.c         | 2 +-
 arch/ia64/kernel/mca_drv.c          | 2 +-
 arch/ia64/kernel/traps.c            | 2 +-
 arch/ia64/mm/fault.c                | 2 +-
 arch/m68k/kernel/traps.c            | 2 +-
 arch/m68k/mm/fault.c                | 2 +-
 arch/microblaze/kernel/exceptions.c | 4 ++--
 arch/mips/kernel/traps.c            | 2 +-
 arch/nds32/kernel/fpu.c             | 2 +-
 arch/nds32/kernel/traps.c           | 8 ++++----
 arch/nios2/kernel/traps.c           | 4 ++--
 arch/openrisc/kernel/traps.c        | 2 +-
 arch/parisc/kernel/traps.c          | 2 +-
 arch/powerpc/kernel/traps.c         | 8 ++++----
 arch/riscv/kernel/traps.c           | 2 +-
 arch/riscv/mm/fault.c               | 2 +-
 arch/s390/kernel/dumpstack.c        | 2 +-
 arch/s390/kernel/nmi.c              | 2 +-
 arch/sh/kernel/traps.c              | 2 +-
 arch/sparc/kernel/traps_32.c        | 4 +---
 arch/sparc/kernel/traps_64.c        | 4 +---
 arch/x86/entry/entry_32.S           | 6 +++---
 arch/x86/entry/entry_64.S           | 6 +++---
 arch/x86/kernel/dumpstack.c         | 4 ++--
 arch/xtensa/kernel/traps.c          | 2 +-
 include/linux/sched/task.h          | 1 +
 kernel/exit.c                       | 9 +++++++++
 tools/objtool/check.c               | 3 ++-
 39 files changed, 63 insertions(+), 56 deletions(-)

(limited to 'tools')

diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 2ae34702456c..8a66fe544c69 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -190,7 +190,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15)
 		local_irq_enable();
 		while (1);
 	}
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 #ifndef CONFIG_MATHEMU
@@ -575,7 +575,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
 
 	printk("Bad unaligned kernel access at %016lx: %p %lx %lu\n",
 		pc, va, opcode, reg);
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 
 got_exception:
 	/* Ok, we caught the exception, but we don't want it.  Is there
@@ -630,7 +630,7 @@ got_exception:
 		local_irq_enable();
 		while (1);
 	}
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 /*
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index eee5102c3d88..e9193d52222e 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -204,7 +204,7 @@ retry:
 	printk(KERN_ALERT "Unable to handle kernel paging request at "
 	       "virtual address %016lx\n", address);
 	die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16);
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 
 	/* We ran out of memory, or some other thing happened to us that
 	   made us unable to handle the page fault gracefully.  */
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 195dff58bafc..b4bd2e5f17c1 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -333,7 +333,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 	if (panic_on_oops)
 		panic("Fatal exception");
 	if (signr)
-		do_exit(signr);
+		make_task_dead(signr);
 }
 
 /*
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index bc8779d54a64..bf1a0c618c49 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -111,7 +111,7 @@ static void die_kernel_fault(const char *msg, struct mm_struct *mm,
 	show_pte(KERN_ALERT, mm, addr);
 	die("Oops", regs, fsr);
 	bust_spinlocks(0);
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 }
 
 /*
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 7b21213a570f..bdd456e4e7f4 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -235,7 +235,7 @@ void die(const char *str, struct pt_regs *regs, int err)
 	raw_spin_unlock_irqrestore(&die_lock, flags);
 
 	if (ret != NOTIFY_STOP)
-		do_exit(SIGSEGV);
+		make_task_dead(SIGSEGV);
 }
 
 static void arm64_show_signal(int signo, const char *str)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9ae24e3b72be..11a28cace2d2 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -302,7 +302,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 	show_pte(addr);
 	die("Oops", regs, esr);
 	bust_spinlocks(0);
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 }
 
 #ifdef CONFIG_KASAN_HW_TAGS
diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c
index cb2a0d94a144..5e2fb45d605c 100644
--- a/arch/csky/abiv1/alignment.c
+++ b/arch/csky/abiv1/alignment.c
@@ -294,7 +294,7 @@ bad_area:
 				__func__, opcode, rz, rx, imm, addr);
 		show_regs(regs);
 		bust_spinlocks(0);
-		do_exit(SIGKILL);
+		make_dead_task(SIGKILL);
 	}
 
 	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr);
diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c
index e5fbf8653a21..88a47035b925 100644
--- a/arch/csky/kernel/traps.c
+++ b/arch/csky/kernel/traps.c
@@ -109,7 +109,7 @@ void die(struct pt_regs *regs, const char *str)
 	if (panic_on_oops)
 		panic("Fatal exception");
 	if (ret != NOTIFY_STOP)
-		do_exit(SIGSEGV);
+		make_dead_task(SIGSEGV);
 }
 
 void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr)
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 466ad949818a..7215a46b6b8e 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -67,7 +67,7 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr)
 	pr_alert("Unable to handle kernel paging request at virtual "
 		 "addr 0x%08lx, pc: 0x%08lx\n", addr, regs->pc);
 	die(regs, "Oops");
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 }
 
 static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c
index bdbe988d8dbc..3d4e0bde37ae 100644
--- a/arch/h8300/kernel/traps.c
+++ b/arch/h8300/kernel/traps.c
@@ -106,7 +106,7 @@ void die(const char *str, struct pt_regs *fp, unsigned long err)
 	dump(fp);
 
 	spin_unlock_irq(&die_lock);
-	do_exit(SIGSEGV);
+	make_dead_task(SIGSEGV);
 }
 
 static int kstack_depth_to_print = 24;
diff --git a/arch/h8300/mm/fault.c b/arch/h8300/mm/fault.c
index d4bc9c16f2df..0223528565dd 100644
--- a/arch/h8300/mm/fault.c
+++ b/arch/h8300/mm/fault.c
@@ -51,7 +51,7 @@ asmlinkage int do_page_fault(struct pt_regs *regs, unsigned long address,
 	printk(" at virtual address %08lx\n", address);
 	if (!user_mode(regs))
 		die("Oops", regs, error_code);
-	do_exit(SIGKILL);
+	make_dead_task(SIGKILL);
 
 	return 1;
 }
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index edfc35dafeb1..6dd6cf0ab711 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -214,7 +214,7 @@ int die(const char *str, struct pt_regs *regs, long err)
 		panic("Fatal exception");
 
 	oops_exit();
-	do_exit(err);
+	make_dead_task(err);
 	return 0;
 }
 
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
index 5bfc79be4cef..23c203639a96 100644
--- a/arch/ia64/kernel/mca_drv.c
+++ b/arch/ia64/kernel/mca_drv.c
@@ -176,7 +176,7 @@ mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 	spin_unlock(&mca_bh_lock);
 
 	/* This process is about to be killed itself */
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 }
 
 /**
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index e13cb905930f..753642366e12 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -85,7 +85,7 @@ die (const char *str, struct pt_regs *regs, long err)
 	if (panic_on_oops)
 		panic("Fatal exception");
 
-  	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 	return 0;
 }
 
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 02de2e70c587..4796cccbf74f 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -259,7 +259,7 @@ retry:
 		regs = NULL;
 	bust_spinlocks(0);
 	if (regs)
-		do_exit(SIGKILL);
+		make_task_dead(SIGKILL);
 	return;
 
   out_of_memory:
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 34d6458340b0..59fc63feb0dc 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1131,7 +1131,7 @@ void die_if_kernel (char *str, struct pt_regs *fp, int nr)
 	pr_crit("%s: %08x\n", str, nr);
 	show_registers(fp);
 	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 asmlinkage void set_esp0(unsigned long ssp)
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index ef46e77e97a5..fcb3a0d8421c 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -48,7 +48,7 @@ int send_fault_sig(struct pt_regs *regs)
 			pr_alert("Unable to handle kernel access");
 		pr_cont(" at virtual address %p\n", addr);
 		die_if_kernel("Oops", regs, 0 /*error_code*/);
-		do_exit(SIGKILL);
+		make_task_dead(SIGKILL);
 	}
 
 	return 1;
diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c
index 908788497b28..fd153d5fab98 100644
--- a/arch/microblaze/kernel/exceptions.c
+++ b/arch/microblaze/kernel/exceptions.c
@@ -44,10 +44,10 @@ void die(const char *str, struct pt_regs *fp, long err)
 	pr_warn("Oops: %s, sig: %ld\n", str, err);
 	show_regs(fp);
 	spin_unlock_irq(&die_lock);
-	/* do_exit() should take care of panic'ing from an interrupt
+	/* make_task_dead() should take care of panic'ing from an interrupt
 	 * context so we don't handle it here
 	 */
-	do_exit(err);
+	make_task_dead(err);
 }
 
 /* for user application debugging */
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index d26b0fb8ea06..a486486b2355 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -422,7 +422,7 @@ void __noreturn die(const char *str, struct pt_regs *regs)
 	if (regs && kexec_should_crash(current))
 		crash_kexec(regs);
 
-	do_exit(sig);
+	make_task_dead(sig);
 }
 
 extern struct exception_table_entry __start___dbe_table[];
diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c
index 9edd7ed7d7bf..701c09a668de 100644
--- a/arch/nds32/kernel/fpu.c
+++ b/arch/nds32/kernel/fpu.c
@@ -223,7 +223,7 @@ inline void handle_fpu_exception(struct pt_regs *regs)
 		}
 	} else if (fpcsr & FPCSR_mskRIT) {
 		if (!user_mode(regs))
-			do_exit(SIGILL);
+			make_task_dead(SIGILL);
 		si_signo = SIGILL;
 	}
 
diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index ca75d475eda4..c0a8f3344fb9 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -141,7 +141,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, int err)
 
 	bust_spinlocks(0);
 	spin_unlock_irq(&die_lock);
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 EXPORT_SYMBOL(die);
@@ -240,7 +240,7 @@ void unhandled_interruption(struct pt_regs *regs)
 	pr_emerg("unhandled_interruption\n");
 	show_regs(regs);
 	if (!user_mode(regs))
-		do_exit(SIGKILL);
+		make_task_dead(SIGKILL);
 	force_sig(SIGKILL);
 }
 
@@ -251,7 +251,7 @@ void unhandled_exceptions(unsigned long entry, unsigned long addr,
 		 addr, type);
 	show_regs(regs);
 	if (!user_mode(regs))
-		do_exit(SIGKILL);
+		make_task_dead(SIGKILL);
 	force_sig(SIGKILL);
 }
 
@@ -278,7 +278,7 @@ void do_revinsn(struct pt_regs *regs)
 	pr_emerg("Reserved Instruction\n");
 	show_regs(regs);
 	if (!user_mode(regs))
-		do_exit(SIGILL);
+		make_task_dead(SIGILL);
 	force_sig(SIGILL);
 }
 
diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c
index 596986a74a26..85ac49d64cf7 100644
--- a/arch/nios2/kernel/traps.c
+++ b/arch/nios2/kernel/traps.c
@@ -37,10 +37,10 @@ void die(const char *str, struct pt_regs *regs, long err)
 	show_regs(regs);
 	spin_unlock_irq(&die_lock);
 	/*
-	 * do_exit() should take care of panic'ing from an interrupt
+	 * make_task_dead() should take care of panic'ing from an interrupt
 	 * context so we don't handle it here
 	 */
-	do_exit(err);
+	make_task_dead(err);
 }
 
 void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr)
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 0898cb159fac..0446a3c34372 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -212,7 +212,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err)
 	__asm__ __volatile__("l.nop   1");
 	do {} while (1);
 #endif
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 /* This is normally the 'Oops' routine */
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index b11fb26ce299..df2122c50d78 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -269,7 +269,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
 		panic("Fatal exception");
 
 	oops_exit();
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 /* gdb uses break 4,8 */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11741703d26e..a08bb7cefdc5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -245,7 +245,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs,
 
 	if (panic_on_oops)
 		panic("Fatal exception");
-	do_exit(signr);
+	make_task_dead(signr);
 }
 NOKPROBE_SYMBOL(oops_end);
 
@@ -792,9 +792,9 @@ int machine_check_generic(struct pt_regs *regs)
 void die_mce(const char *str, struct pt_regs *regs, long err)
 {
 	/*
-	 * The machine check wants to kill the interrupted context, but
-	 * do_exit() checks for in_interrupt() and panics in that case, so
-	 * exit the irq/nmi before calling die.
+	 * The machine check wants to kill the interrupted context,
+	 * but make_task_dead() checks for in_interrupt() and panics
+	 * in that case, so exit the irq/nmi before calling die.
 	 */
 	if (in_nmi())
 		nmi_exit();
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 0daaa3e4630d..fe92e119e6a3 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -54,7 +54,7 @@ void die(struct pt_regs *regs, const char *str)
 	if (panic_on_oops)
 		panic("Fatal exception");
 	if (ret != NOTIFY_STOP)
-		do_exit(SIGSEGV);
+		make_task_dead(SIGSEGV);
 }
 
 void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr)
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index aa08dd2f8fae..42118bc728f9 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -31,7 +31,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 
 	bust_spinlocks(0);
 	die(regs, "Oops");
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
 }
 
 static inline void no_context(struct pt_regs *regs, unsigned long addr)
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 0681c55e831d..1e3233eb510a 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -224,5 +224,5 @@ void __noreturn die(struct pt_regs *regs, const char *str)
 	if (panic_on_oops)
 		panic("Fatal exception: panic_on_oops");
 	oops_exit();
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 20f8e1868853..a4d8c058dd27 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -175,7 +175,7 @@ void __s390_handle_mcck(void)
 		       "malfunction (code 0x%016lx).\n", mcck.mcck_code);
 		printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
 		       current->comm, current->pid);
-		do_exit(SIGSEGV);
+		make_task_dead(SIGSEGV);
 	}
 }
 
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index cbe3201d4f21..01884054aeb2 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -57,7 +57,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err)
 	if (panic_on_oops)
 		panic("Fatal exception");
 
-	do_exit(SIGSEGV);
+	make_task_dead(SIGSEGV);
 }
 
 void die_if_kernel(const char *str, struct pt_regs *regs, long err)
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index 5630e5a395e0..179aabfa712e 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -86,9 +86,7 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs)
 	}
 	printk("Instruction DUMP:");
 	instruction_dump ((unsigned long *) regs->pc);
-	if(regs->psr & PSR_PS)
-		do_exit(SIGKILL);
-	do_exit(SIGSEGV);
+	make_task_dead((regs->psr & PSR_PS) ? SIGKILL : SIGSEGV);
 }
 
 void do_hw_interrupt(struct pt_regs *regs, unsigned long type)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 6863025ed56d..21077821f427 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2559,9 +2559,7 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs)
 	}
 	if (panic_on_oops)
 		panic("Fatal exception");
-	if (regs->tstate & TSTATE_PRIV)
-		do_exit(SIGKILL);
-	do_exit(SIGSEGV);
+	make_task_dead((regs->tstate & TSTATE_PRIV)? SIGKILL : SIGSEGV);
 }
 EXPORT_SYMBOL(die_if_kernel);
 
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ccb9d32768f3..7738fad6a85e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1248,14 +1248,14 @@ SYM_CODE_START(asm_exc_nmi)
 SYM_CODE_END(asm_exc_nmi)
 
 .pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
 	/* Prevent any naive code from trying to unwind to our caller. */
 	xorl	%ebp, %ebp
 
 	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
 	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
 
-	call	do_exit
+	call	make_task_dead
 1:	jmp 1b
-SYM_CODE_END(rewind_stack_do_exit)
+SYM_CODE_END(rewind_stack_and_make_dead)
 .popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..f09276457942 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1429,7 +1429,7 @@ SYM_CODE_END(ignore_sysret)
 #endif
 
 .pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
 	UNWIND_HINT_FUNC
 	/* Prevent any naive code from trying to unwind to our caller. */
 	xorl	%ebp, %ebp
@@ -1438,6 +1438,6 @@ SYM_CODE_START(rewind_stack_do_exit)
 	leaq	-PTREGS_SIZE(%rax), %rsp
 	UNWIND_HINT_REGS
 
-	call	do_exit
-SYM_CODE_END(rewind_stack_do_exit)
+	call	make_task_dead
+SYM_CODE_END(rewind_stack_and_make_dead)
 .popsection
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ea4fe192189d..53de044e5654 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -351,7 +351,7 @@ unsigned long oops_begin(void)
 }
 NOKPROBE_SYMBOL(oops_begin);
 
-void __noreturn rewind_stack_do_exit(int signr);
+void __noreturn rewind_stack_and_make_dead(int signr);
 
 void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
@@ -386,7 +386,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 	 * reuse the task stack and that existing poisons are invalid.
 	 */
 	kasan_unpoison_task_stack(current);
-	rewind_stack_do_exit(signr);
+	rewind_stack_and_make_dead(signr);
 }
 NOKPROBE_SYMBOL(oops_end);
 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 4b4dbeb2d612..9345007d474d 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -552,5 +552,5 @@ void __noreturn die(const char * str, struct pt_regs * regs, long err)
 	if (panic_on_oops)
 		panic("Fatal exception");
 
-	do_exit(err);
+	make_task_dead(err);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ba88a6987400..2d4bbd9c3278 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -59,6 +59,7 @@ extern void sched_post_fork(struct task_struct *p,
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);
+void __noreturn make_task_dead(int signr);
 
 extern void proc_caches_init(void);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f702a6a63686..bfa513c5b227 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -884,6 +884,15 @@ void __noreturn do_exit(long code)
 }
 EXPORT_SYMBOL_GPL(do_exit);
 
+void __noreturn make_task_dead(int signr)
+{
+	/*
+	 * Take the task off the cpu after something catastrophic has
+	 * happened.
+	 */
+	do_exit(signr);
+}
+
 void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 21735829b860..e6ab5687770b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -168,6 +168,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"panic",
 		"do_exit",
 		"do_task_dead",
+		"make_task_dead",
 		"__module_put_and_exit",
 		"complete_and_exit",
 		"__reiserfs_panic",
@@ -175,7 +176,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"fortify_panic",
 		"usercopy_abort",
 		"machine_real_restart",
-		"rewind_stack_do_exit",
+		"rewind_stack_and_make_dead"
 		"kunit_try_catch_throw",
 		"xen_start_kernel",
 		"cpu_bringup_and_idle",
-- 
cgit v1.2.3


From bbda86e988d4c124e4cfa816291cbd583ae8bfb1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 22 Nov 2021 10:27:36 -0600
Subject: exit: Implement kthread_exit

The way the per task_struct exit_code is used by kernel threads is not
quite compatible how it is used by userspace applications.  The low
byte of the userspace exit_code value encodes the exit signal.  While
kthreads just use the value as an int holding ordinary kernel function
exit status like -EPERM.

Add kthread_exit to clearly separate the two kinds of uses.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/kthread.h |  1 +
 kernel/kthread.c        | 23 +++++++++++++++++++----
 tools/objtool/check.c   |  1 +
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 346b0f269161..22c43d419687 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -70,6 +70,7 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
+void kthread_exit(long result) __noreturn;
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7113003fab63..77b7c3f23f18 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -268,6 +268,21 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
+/**
+ * kthread_exit - Cause the current kthread return @result to kthread_stop().
+ * @result: The integer value to return to kthread_stop().
+ *
+ * While kthread_exit can be called directly, it exists so that
+ * functions which do some additional work in non-modular code such as
+ * module_put_and_kthread_exit can be implemented.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_exit(long result)
+{
+	do_exit(result);
+}
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
@@ -286,13 +301,13 @@ static int kthread(void *_create)
 	done = xchg(&create->done, NULL);
 	if (!done) {
 		kfree(create);
-		do_exit(-EINTR);
+		kthread_exit(-EINTR);
 	}
 
 	if (!self) {
 		create->result = ERR_PTR(-ENOMEM);
 		complete(done);
-		do_exit(-ENOMEM);
+		kthread_exit(-ENOMEM);
 	}
 
 	self->threadfn = threadfn;
@@ -326,7 +341,7 @@ static int kthread(void *_create)
 		__kthread_parkme(self);
 		ret = threadfn(data);
 	}
-	do_exit(ret);
+	kthread_exit(ret);
 }
 
 /* called from kernel_clone() to get node information for about to be created task */
@@ -627,7 +642,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
  * instead of calling wake_up_process(): the thread will exit without
  * calling threadfn().
  *
- * If threadfn() may call do_exit() itself, the caller must ensure
+ * If threadfn() may call kthread_exit() itself, the caller must ensure
  * task_struct can't go away.
  *
  * Returns the result of threadfn(), or %-EINTR if wake_up_process()
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index e6ab5687770b..90108fe5610d 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -168,6 +168,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"panic",
 		"do_exit",
 		"do_task_dead",
+		"kthread_exit",
 		"make_task_dead",
 		"__module_put_and_exit",
 		"complete_and_exit",
-- 
cgit v1.2.3


From ca3574bd653aba234a4b31955f2778947403be16 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Dec 2021 11:00:19 -0600
Subject: exit: Rename module_put_and_exit to module_put_and_kthread_exit

Update module_put_and_exit to call kthread_exit instead of do_exit.

Change the name to reflect this change in functionality.  All of the
users of module_put_and_exit are causing the current kthread to exit
so this change makes it clear what is happening.  There is no
functional change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 crypto/algboss.c          | 4 ++--
 fs/cifs/connect.c         | 2 +-
 fs/nfs/callback.c         | 4 ++--
 fs/nfs/nfs4state.c        | 2 +-
 fs/nfsd/nfssvc.c          | 2 +-
 include/linux/module.h    | 6 +++---
 kernel/module.c           | 6 +++---
 net/bluetooth/bnep/core.c | 2 +-
 net/bluetooth/cmtp/core.c | 2 +-
 net/bluetooth/hidp/core.c | 2 +-
 tools/objtool/check.c     | 2 +-
 11 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/crypto/algboss.c b/crypto/algboss.c
index 1814d2c5188a..eb5fe84efb83 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -67,7 +67,7 @@ out:
 	complete_all(&param->larval->completion);
 	crypto_alg_put(&param->larval->alg);
 	kfree(param);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 }
 
 static int cryptomgr_schedule_probe(struct crypto_larval *larval)
@@ -190,7 +190,7 @@ skiptest:
 	crypto_alg_tested(param->driver, err);
 
 	kfree(param);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 }
 
 static int cryptomgr_schedule_test(struct crypto_alg *alg)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 82577a7a5bb1..39fbe9acbf51 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1139,7 +1139,7 @@ next_pdu:
 	}
 
 	memalloc_noreclaim_restore(noreclaim_flag);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 }
 
 /*
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 86d856de1389..3c86a559a321 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -93,7 +93,7 @@ nfs4_callback_svc(void *vrqstp)
 		svc_process(rqstp);
 	}
 	svc_exit_thread(rqstp);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
@@ -137,7 +137,7 @@ nfs41_callback_svc(void *vrqstp)
 		}
 	}
 	svc_exit_thread(rqstp);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ecc4594299d6..ea41af731978 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2689,6 +2689,6 @@ static int nfs4_run_state_manager(void *ptr)
 	allow_signal(SIGKILL);
 	nfs4_state_manager(clp);
 	nfs_put_client(clp);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80431921e5d7..5ce9f14318c4 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -986,7 +986,7 @@ out:
 
 	/* Release module */
 	mutex_unlock(&nfsd_mutex);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
diff --git a/include/linux/module.h b/include/linux/module.h
index c9f1200b2312..f03be97e9ec1 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -595,9 +595,9 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
-extern void __noreturn __module_put_and_exit(struct module *mod,
+extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
-#define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
+#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
 
 #ifdef CONFIG_MODULE_UNLOAD
 int module_refcount(struct module *mod);
@@ -790,7 +790,7 @@ static inline int unregister_module_notifier(struct notifier_block *nb)
 	return 0;
 }
 
-#define module_put_and_exit(code) do_exit(code)
+#define module_put_and_kthread_exit(code) kthread_exit(code)
 
 static inline void print_modules(void)
 {
diff --git a/kernel/module.c b/kernel/module.c
index 84a9141a5e15..a3aa00bf270d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -337,12 +337,12 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
  * A thread that wants to hold a reference to a module only while it
  * is running can call this to safely exit.  nfsd and lockd use this.
  */
-void __noreturn __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
 {
 	module_put(mod);
-	do_exit(code);
+	kthread_exit(code);
 }
-EXPORT_SYMBOL(__module_put_and_exit);
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
 
 /* Find a module section: 0 means not found. */
 static unsigned int find_sec(const struct load_info *info, const char *name)
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index c9add7753b9f..40baa6b7321a 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -535,7 +535,7 @@ static int bnep_session(void *arg)
 
 	up_write(&bnep_session_sem);
 	free_netdev(dev);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 0a2d78e811cf..9bfded6b74b3 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -323,7 +323,7 @@ static int cmtp_session(void *arg)
 	up_write(&cmtp_session_sem);
 
 	kfree(session);
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 80848dfc01db..5940744a8cd8 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1305,7 +1305,7 @@ static int hidp_session_thread(void *arg)
 	l2cap_unregister_user(session->conn, &session->user);
 	hidp_session_put(session);
 
-	module_put_and_exit(0);
+	module_put_and_kthread_exit(0);
 	return 0;
 }
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 90108fe5610d..120e9598c11a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -170,7 +170,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"do_task_dead",
 		"kthread_exit",
 		"make_task_dead",
-		"__module_put_and_exit",
+		"__module_put_and_kthread_exit",
 		"complete_and_exit",
 		"__reiserfs_panic",
 		"lbug_with_loc",
-- 
cgit v1.2.3


From cead18552660702a4a46f58e65188fe5f36e9dfe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 22 Nov 2021 11:15:19 -0600
Subject: exit: Rename complete_and_exit to kthread_complete_and_exit

Update complete_and_exit to call kthread_exit instead of do_exit.

Change the name to reflect this change in functionality.  All of the
users of complete_and_exit are causing the current kthread to exit so
this change makes it clear what is happening.

Move the implementation of kthread_complete_and_exit from
kernel/exit.c to to kernel/kthread.c.  As this function is kthread
specific it makes most sense to live with the kthread functions.

There are no functional change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 drivers/net/wireless/rsi/rsi_91x_coex.c      |  2 +-
 drivers/net/wireless/rsi/rsi_91x_main.c      |  2 +-
 drivers/net/wireless/rsi/rsi_91x_sdio_ops.c  |  2 +-
 drivers/net/wireless/rsi/rsi_91x_usb_ops.c   |  2 +-
 drivers/pnp/pnpbios/core.c                   |  6 +++---
 drivers/staging/rts5208/rtsx.c               | 16 ++++++++--------
 drivers/usb/atm/usbatm.c                     |  2 +-
 drivers/usb/gadget/function/f_mass_storage.c |  2 +-
 fs/jffs2/background.c                        |  2 +-
 include/linux/kernel.h                       |  1 -
 include/linux/kthread.h                      |  1 +
 kernel/exit.c                                |  9 ---------
 kernel/kthread.c                             | 21 +++++++++++++++++++++
 lib/kunit/try-catch.c                        |  4 ++--
 tools/objtool/check.c                        |  2 +-
 15 files changed, 43 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/drivers/net/wireless/rsi/rsi_91x_coex.c b/drivers/net/wireless/rsi/rsi_91x_coex.c
index a0c5d02ae88c..8a3d86897ea8 100644
--- a/drivers/net/wireless/rsi/rsi_91x_coex.c
+++ b/drivers/net/wireless/rsi/rsi_91x_coex.c
@@ -63,7 +63,7 @@ static void rsi_coex_scheduler_thread(struct rsi_common *common)
 		rsi_coex_sched_tx_pkts(coex_cb);
 	} while (atomic_read(&coex_cb->coex_tx_thread.thread_done) == 0);
 
-	complete_and_exit(&coex_cb->coex_tx_thread.completion, 0);
+	kthread_complete_and_exit(&coex_cb->coex_tx_thread.completion, 0);
 }
 
 int rsi_coex_recv_pkt(struct rsi_common *common, u8 *msg)
diff --git a/drivers/net/wireless/rsi/rsi_91x_main.c b/drivers/net/wireless/rsi/rsi_91x_main.c
index f1bf71e6c608..c7f5cec5e446 100644
--- a/drivers/net/wireless/rsi/rsi_91x_main.c
+++ b/drivers/net/wireless/rsi/rsi_91x_main.c
@@ -260,7 +260,7 @@ static void rsi_tx_scheduler_thread(struct rsi_common *common)
 		if (common->init_done)
 			rsi_core_qos_processor(common);
 	} while (atomic_read(&common->tx_thread.thread_done) == 0);
-	complete_and_exit(&common->tx_thread.completion, 0);
+	kthread_complete_and_exit(&common->tx_thread.completion, 0);
 }
 
 #ifdef CONFIG_RSI_COEX
diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
index 8ace1874e5cb..b2b47a0abcbf 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
@@ -75,7 +75,7 @@ void rsi_sdio_rx_thread(struct rsi_common *common)
 
 	rsi_dbg(INFO_ZONE, "%s: Terminated SDIO RX thread\n", __func__);
 	atomic_inc(&sdev->rx_thread.thread_done);
-	complete_and_exit(&sdev->rx_thread.completion, 0);
+	kthread_complete_and_exit(&sdev->rx_thread.completion, 0);
 }
 
 /**
diff --git a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
index 4ffcdde1acb1..5130b0e72adc 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
@@ -56,6 +56,6 @@ void rsi_usb_rx_thread(struct rsi_common *common)
 out:
 	rsi_dbg(INFO_ZONE, "%s: Terminated thread\n", __func__);
 	skb_queue_purge(&dev->rx_q);
-	complete_and_exit(&dev->rx_thread.completion, 0);
+	kthread_complete_and_exit(&dev->rx_thread.completion, 0);
 }
 
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 669ef4700c1a..f7e86ae9f72f 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -160,7 +160,7 @@ static int pnp_dock_thread(void *unused)
 			 * No dock to manage
 			 */
 		case PNP_FUNCTION_NOT_SUPPORTED:
-			complete_and_exit(&unload_sem, 0);
+			kthread_complete_and_exit(&unload_sem, 0);
 		case PNP_SYSTEM_NOT_DOCKED:
 			d = 0;
 			break;
@@ -170,7 +170,7 @@ static int pnp_dock_thread(void *unused)
 		default:
 			pnpbios_print_status("pnp_dock_thread", status);
 			printk(KERN_WARNING "PnPBIOS: disabling dock monitoring.\n");
-			complete_and_exit(&unload_sem, 0);
+			kthread_complete_and_exit(&unload_sem, 0);
 		}
 		if (d != docked) {
 			if (pnp_dock_event(d, &now) == 0) {
@@ -183,7 +183,7 @@ static int pnp_dock_thread(void *unused)
 			}
 		}
 	}
-	complete_and_exit(&unload_sem, 0);
+	kthread_complete_and_exit(&unload_sem, 0);
 }
 
 static int pnpbios_get_resources(struct pnp_dev *dev)
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
index 91fcf85e150a..5a58dac76c88 100644
--- a/drivers/staging/rts5208/rtsx.c
+++ b/drivers/staging/rts5208/rtsx.c
@@ -450,13 +450,13 @@ skip_for_abort:
 	 * after the down() -- that's necessary for the thread-shutdown
 	 * case.
 	 *
-	 * complete_and_exit() goes even further than this -- it is safe in
-	 * the case that the thread of the caller is going away (not just
-	 * the structure) -- this is necessary for the module-remove case.
-	 * This is important in preemption kernels, which transfer the flow
-	 * of execution immediately upon a complete().
+	 * kthread_complete_and_exit() goes even further than this --
+	 * it is safe in the case that the thread of the caller is going away
+	 * (not just the structure) -- this is necessary for the module-remove
+	 * case.  This is important in preemption kernels, which transfer the
+	 * flow of execution immediately upon a complete().
 	 */
-	complete_and_exit(&dev->control_exit, 0);
+	kthread_complete_and_exit(&dev->control_exit, 0);
 }
 
 static int rtsx_polling_thread(void *__dev)
@@ -501,7 +501,7 @@ static int rtsx_polling_thread(void *__dev)
 		mutex_unlock(&dev->dev_mutex);
 	}
 
-	complete_and_exit(&dev->polling_exit, 0);
+	kthread_complete_and_exit(&dev->polling_exit, 0);
 }
 
 /*
@@ -682,7 +682,7 @@ static int rtsx_scan_thread(void *__dev)
 		/* Should we unbind if no devices were detected? */
 	}
 
-	complete_and_exit(&dev->scanning_done, 0);
+	kthread_complete_and_exit(&dev->scanning_done, 0);
 }
 
 static void rtsx_init_options(struct rtsx_chip *chip)
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index da17be1ef64e..e3a49d837609 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -969,7 +969,7 @@ static int usbatm_do_heavy_init(void *arg)
 	instance->thread = NULL;
 	mutex_unlock(&instance->serialize);
 
-	complete_and_exit(&instance->thread_exited, ret);
+	kthread_complete_and_exit(&instance->thread_exited, ret);
 }
 
 static int usbatm_heavy_init(struct usbatm_data *instance)
diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c
index 752439690fda..46dd11dcb3a8 100644
--- a/drivers/usb/gadget/function/f_mass_storage.c
+++ b/drivers/usb/gadget/function/f_mass_storage.c
@@ -2547,7 +2547,7 @@ static int fsg_main_thread(void *common_)
 	up_write(&common->filesem);
 
 	/* Let fsg_unbind() know the thread has exited */
-	complete_and_exit(&common->thread_notifier, 0);
+	kthread_complete_and_exit(&common->thread_notifier, 0);
 }
 
 
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 2b4d5013dc5d..6da92ecaf66d 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -161,5 +161,5 @@ static int jffs2_garbage_collect_thread(void *_c)
 	spin_lock(&c->erase_completion_lock);
 	c->gc_task = NULL;
 	spin_unlock(&c->erase_completion_lock);
-	complete_and_exit(&c->gc_thread_exit, 0);
+	kthread_complete_and_exit(&c->gc_thread_exit, 0);
 }
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 77755ac3e189..055eb203c00e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -187,7 +187,6 @@ static inline void might_fault(void) { }
 #endif
 
 void do_exit(long error_code) __noreturn;
-void complete_and_exit(struct completion *, long) __noreturn;
 
 extern int num_to_str(char *buf, int size,
 		      unsigned long long num, unsigned int width);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 22c43d419687..d86a7e3b9a52 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -71,6 +71,7 @@ int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
 void kthread_exit(long result) __noreturn;
+void kthread_complete_and_exit(struct completion *, long) __noreturn;
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/exit.c b/kernel/exit.c
index 57afac845a0a..6c4b04531f17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -891,15 +891,6 @@ void __noreturn make_task_dead(int signr)
 	do_exit(signr);
 }
 
-void complete_and_exit(struct completion *comp, long code)
-{
-	if (comp)
-		complete(comp);
-
-	do_exit(code);
-}
-EXPORT_SYMBOL(complete_and_exit);
-
 SYSCALL_DEFINE1(exit, int, error_code)
 {
 	do_exit((error_code&0xff)<<8);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 77b7c3f23f18..4388d6694a7f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -283,6 +283,27 @@ void __noreturn kthread_exit(long result)
 	do_exit(result);
 }
 
+/**
+ * kthread_complete_and exit - Exit the current kthread.
+ * @comp: Completion to complete
+ * @code: The integer value to return to kthread_stop().
+ *
+ * If present complete @comp and the reuturn code to kthread_stop().
+ *
+ * A kernel thread whose module may be removed after the completion of
+ * @comp can use this function exit safely.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
+{
+	if (comp)
+		complete(comp);
+
+	kthread_exit(code);
+}
+EXPORT_SYMBOL(kthread_complete_and_exit);
+
 static int kthread(void *_create)
 {
 	static const struct sched_param param = { .sched_priority = 0 };
diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c
index 0dd434e40487..be38a2c5ecc2 100644
--- a/lib/kunit/try-catch.c
+++ b/lib/kunit/try-catch.c
@@ -17,7 +17,7 @@
 void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch)
 {
 	try_catch->try_result = -EFAULT;
-	complete_and_exit(try_catch->try_completion, -EFAULT);
+	kthread_complete_and_exit(try_catch->try_completion, -EFAULT);
 }
 EXPORT_SYMBOL_GPL(kunit_try_catch_throw);
 
@@ -27,7 +27,7 @@ static int kunit_generic_run_threadfn_adapter(void *data)
 
 	try_catch->try(try_catch->context);
 
-	complete_and_exit(try_catch->try_completion, 0);
+	kthread_complete_and_exit(try_catch->try_completion, 0);
 }
 
 static unsigned long kunit_test_timeout(void)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 120e9598c11a..282273a1ffa5 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -171,7 +171,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"kthread_exit",
 		"make_task_dead",
 		"__module_put_and_kthread_exit",
-		"complete_and_exit",
+		"kthread_complete_and_exit",
 		"__reiserfs_panic",
 		"lbug_with_loc",
 		"fortify_panic",
-- 
cgit v1.2.3


From 9a6bb30a8830bb868b09629f0b9ad5d2b5fbb2f9 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Fri, 8 Oct 2021 16:24:21 -0700
Subject: kunit: tool: fix --json output for skipped tests

Currently, KUnit will report SKIPPED tests as having failed if one uses
--json.

Add the missing if statement to set the appropriate status ("SKIP").
See https://api.kernelci.org/schema-test-case.html:
  "status": {
      "type": "string",
      "description": "The status of the execution of this test case",
      "enum": ["PASS", "FAIL", "SKIP", "ERROR"],
      "default": "PASS"
  },
with this, we now can properly produce all four of the statuses.

Fixes: 5acaf6031f53 ("kunit: tool: Support skipped tests in kunit_tool")
Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_json.py      | 2 ++
 tools/testing/kunit/kunit_tool_test.py | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_json.py b/tools/testing/kunit/kunit_json.py
index 746bec72b9ac..b6e66c5d64d1 100644
--- a/tools/testing/kunit/kunit_json.py
+++ b/tools/testing/kunit/kunit_json.py
@@ -30,6 +30,8 @@ def _get_group_json(test: Test, def_config: str,
 			test_case = {"name": subtest.name, "status": "FAIL"}
 			if subtest.status == TestStatus.SUCCESS:
 				test_case["status"] = "PASS"
+			elif subtest.status == TestStatus.SKIPPED:
+				test_case["status"] = "SKIP"
 			elif subtest.status == TestStatus.TEST_CRASHED:
 				test_case["status"] = "ERROR"
 			test_cases.append(test_case)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 9c4126731457..34cb0a12ba18 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -383,6 +383,12 @@ class KUnitJsonTest(unittest.TestCase):
 			{'name': 'example_simple_test', 'status': 'ERROR'},
 			result["sub_groups"][1]["test_cases"][0])
 
+	def test_skipped_test_json(self):
+		result = self._json_for('test_skip_tests.log')
+		self.assertEqual(
+			{'name': 'example_skip_test', 'status': 'SKIP'},
+			result["sub_groups"][1]["test_cases"][1])
+
 	def test_no_tests_json(self):
 		result = self._json_for('test_is_test_passed-no_tests_run_with_header.log')
 		self.assertEqual(0, len(result['sub_groups']))
-- 
cgit v1.2.3


From ee92ed38364e7073bb741577d7b243b232442014 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Tue, 2 Nov 2021 21:22:28 -0700
Subject: kunit: add run_checks.py script to validate kunit changes

This formalizes the checks KUnit maintainers have been running (or in
other cases: forgetting to run).

This script also runs them all in parallel to minimize friction (pytype
can be fairly slow, but not slower than running kunit.py).

Example output:
$ ./tools/testing/kunit/run_checks.py
Waiting on 4 checks (kunit_tool_test.py, kunit smoke test, pytype, mypy)...
kunit_tool_test.py: PASSED
mypy: PASSED
pytype: PASSED
kunit smoke test: PASSED

On failure or timeout (5 minutes), it'll dump out the stdout/stderr.
E.g. adding in a type-checking error:
  mypy: FAILED
  > kunit.py:54: error: Name 'nonexistent_function' is not defined
  > Found 1 error in 1 file (checked 8 source files)

mypy and pytype are two Python type-checkers and must be installed.
This file treats them as optional and will mark them as SKIPPED if not
installed.

This tool also runs `kunit.py run --kunitconfig=lib/kunit` to run
KUnit's own KUnit tests and to verify KUnit kernel code and kunit.py
play nicely together.

It uses --build_dir=kunit_run_checks so as not to clobber the default
build_dir, which helps make it faster by reducing the need to rebuild,
esp. if you're been passing in --arch instead of using UML.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/run_checks.py | 81 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100755 tools/testing/kunit/run_checks.py

(limited to 'tools')

diff --git a/tools/testing/kunit/run_checks.py b/tools/testing/kunit/run_checks.py
new file mode 100755
index 000000000000..4f32133ed77c
--- /dev/null
+++ b/tools/testing/kunit/run_checks.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# This file runs some basic checks to verify kunit works.
+# It is only of interest if you're making changes to KUnit itself.
+#
+# Copyright (C) 2021, Google LLC.
+# Author: Daniel Latypov <dlatypov@google.com.com>
+
+from concurrent import futures
+import datetime
+import os
+import shutil
+import subprocess
+import sys
+import textwrap
+from typing import Dict, List, Sequence, Tuple
+
+ABS_TOOL_PATH = os.path.abspath(os.path.dirname(__file__))
+TIMEOUT = datetime.timedelta(minutes=5).total_seconds()
+
+commands: Dict[str, Sequence[str]] = {
+	'kunit_tool_test.py': ['./kunit_tool_test.py'],
+	'kunit smoke test': ['./kunit.py', 'run', '--kunitconfig=lib/kunit', '--build_dir=kunit_run_checks'],
+	'pytype': ['/bin/sh', '-c', 'pytype *.py'],
+	'mypy': ['/bin/sh', '-c', 'mypy *.py'],
+}
+
+# The user might not have mypy or pytype installed, skip them if so.
+# Note: you can install both via `$ pip install mypy pytype`
+necessary_deps : Dict[str, str] = {
+	'pytype': 'pytype',
+	'mypy': 'mypy',
+}
+
+def main(argv: Sequence[str]) -> None:
+	if argv:
+		raise RuntimeError('This script takes no arguments')
+
+	future_to_name: Dict[futures.Future, str] = {}
+	executor = futures.ThreadPoolExecutor(max_workers=len(commands))
+	for name, argv in commands.items():
+		if name in necessary_deps and shutil.which(necessary_deps[name]) is None:
+			print(f'{name}: SKIPPED, {necessary_deps[name]} not in $PATH')
+			continue
+		f = executor.submit(run_cmd, argv)
+		future_to_name[f] = name
+
+	has_failures = False
+	print(f'Waiting on {len(future_to_name)} checks ({", ".join(future_to_name.values())})...')
+	for f in  futures.as_completed(future_to_name.keys()):
+		name = future_to_name[f]
+		ex = f.exception()
+		if not ex:
+			print(f'{name}: PASSED')
+			continue
+
+		has_failures = True
+		if isinstance(ex, subprocess.TimeoutExpired):
+			print(f'{name}: TIMED OUT')
+		elif isinstance(ex, subprocess.CalledProcessError):
+			print(f'{name}: FAILED')
+		else:
+			print('{name}: unexpected exception: {ex}')
+			continue
+
+		output = ex.output
+		if output:
+			print(textwrap.indent(output.decode(), '> '))
+	executor.shutdown()
+
+	if has_failures:
+		sys.exit(1)
+
+
+def run_cmd(argv: Sequence[str]):
+	subprocess.check_output(argv, stderr=subprocess.STDOUT, cwd=ABS_TOOL_PATH, timeout=TIMEOUT)
+
+
+if __name__ == '__main__':
+	main(sys.argv[1:])
-- 
cgit v1.2.3


From c68077b14692f4a5d42334625d6d4924b234791e Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Tue, 2 Nov 2021 00:30:11 -0700
Subject: kunit: tool: Do not error on tests without test plans

The (K)TAP spec encourages test output to begin with a 'test plan': a
count of the number of tests being run of the form:
1..n

However, some test suites might not know the number of subtests in
advance (for example, KUnit's parameterised tests use a generator
function). In this case, it's not possible to print the test plan in
advance.

kunit_tool already parses test output which doesn't contain a plan, but
reports an error. Since we want to use nested subtests with KUnit
paramterised tests, remove this error.

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_parser.py    | 5 ++---
 tools/testing/kunit/kunit_tool_test.py | 5 ++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 3355196d0515..50ded55c168c 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -340,8 +340,8 @@ def parse_test_plan(lines: LineStream, test: Test) -> bool:
 	"""
 	Parses test plan line and stores the expected number of subtests in
 	test object. Reports an error if expected count is 0.
-	Returns False and reports missing test plan error if fails to parse
-	test plan.
+	Returns False and sets expected_count to None if there is no valid test
+	plan.
 
 	Accepted format:
 	- '1..[number of subtests]'
@@ -356,7 +356,6 @@ def parse_test_plan(lines: LineStream, test: Test) -> bool:
 	match = TEST_PLAN.match(lines.peek())
 	if not match:
 		test.expected_count = None
-		test.add_error('missing plan line!')
 		return False
 	test.log.append(lines.pop())
 	expected_count = int(match.group(1))
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 34cb0a12ba18..0fcdddf0124c 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -191,7 +191,10 @@ class KUnitParserTest(unittest.TestCase):
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(
 				file.readlines()))
-		self.assertEqual(2, result.test.counts.errors)
+		# A missing test plan is not an error.
+		self.assertEqual(0, result.test.counts.errors)
+		# All tests should be accounted for.
+		self.assertEqual(10, result.test.counts.total())
 		self.assertEqual(
 			kunit_parser.TestStatus.SUCCESS,
 			result.status)
-- 
cgit v1.2.3


From e56e482855b782b0e38b91cc383324bb80461073 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Tue, 2 Nov 2021 00:30:12 -0700
Subject: kunit: tool: Report an error if any test has no subtests

It's possible for a test to have a subtest header, but zero valid
subtests. We used to error on this if the test plan had no subtests
listed, but it's possible to have subtests without a test plan (indeed,
this is how parameterised tests work).

Tests with 0 subtests now have the result NO_TESTS, and will report an
error (which does not halt test execution, but is printed in a scary red
colour and is noted in the results summary).

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_parser.py                      | 16 +++++++++++-----
 tools/testing/kunit/kunit_tool_test.py                   | 12 ++++++++++++
 .../test_data/test_is_test_passed-no_tests_no_plan.log   |  7 +++++++
 3 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/kunit/test_data/test_is_test_passed-no_tests_no_plan.log

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 50ded55c168c..68c847e8ca58 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -360,9 +360,6 @@ def parse_test_plan(lines: LineStream, test: Test) -> bool:
 	test.log.append(lines.pop())
 	expected_count = int(match.group(1))
 	test.expected_count = expected_count
-	if expected_count == 0:
-		test.status = TestStatus.NO_TESTS
-		test.add_error('0 tests run!')
 	return True
 
 TEST_RESULT = re.compile(r'^(ok|not ok) ([0-9]+) (- )?([^#]*)( # .*)?$')
@@ -589,6 +586,8 @@ def format_test_result(test: Test) -> str:
 		return (green('[PASSED] ') + test.name)
 	elif test.status == TestStatus.SKIPPED:
 		return (yellow('[SKIPPED] ') + test.name)
+	elif test.status == TestStatus.NO_TESTS:
+		return (yellow('[NO TESTS RUN] ') + test.name)
 	elif test.status == TestStatus.TEST_CRASHED:
 		print_log(test.log)
 		return (red('[CRASHED] ') + test.name)
@@ -731,6 +730,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str]) -> Test:
 		# test plan
 		test.name = "main"
 		parse_test_plan(lines, test)
+		parent_test = True
 	else:
 		# If KTAP/TAP header is not found, test must be subtest
 		# header or test result line so parse attempt to parser
@@ -744,7 +744,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str]) -> Test:
 	expected_count = test.expected_count
 	subtests = []
 	test_num = 1
-	while expected_count is None or test_num <= expected_count:
+	while parent_test and (expected_count is None or test_num <= expected_count):
 		# Loop to parse any subtests.
 		# Break after parsing expected number of tests or
 		# if expected number of tests is unknown break when test
@@ -779,9 +779,15 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str]) -> Test:
 			parse_test_result(lines, test, expected_num)
 		else:
 			test.add_error('missing subtest result line!')
+
+	# Check for there being no tests
+	if parent_test and len(subtests) == 0:
+		test.status = TestStatus.NO_TESTS
+		test.add_error('0 tests run!')
+
 	# Add statuses to TestCounts attribute in Test object
 	bubble_up_test_results(test)
-	if parent_test:
+	if parent_test and not main:
 		# If test has subtests and is not the main test object, print
 		# footer.
 		print_test_footer(test)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 0fcdddf0124c..1b93f1151fcf 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -209,6 +209,18 @@ class KUnitParserTest(unittest.TestCase):
 			kunit_parser.TestStatus.NO_TESTS,
 			result.status)
 
+	def test_no_tests_no_plan(self):
+		no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log')
+		with open(no_plan_log) as file:
+			result = kunit_parser.parse_run_tests(
+				kunit_parser.extract_tap_lines(file.readlines()))
+		self.assertEqual(0, len(result.test.subtests[0].subtests[0].subtests))
+		self.assertEqual(
+			kunit_parser.TestStatus.NO_TESTS,
+			result.test.subtests[0].subtests[0].status)
+		self.assertEqual(1, result.test.counts.errors)
+
+
 	def test_no_kunit_output(self):
 		crash_log = test_data_path('test_insufficient_memory.log')
 		print_mock = mock.patch('builtins.print').start()
diff --git a/tools/testing/kunit/test_data/test_is_test_passed-no_tests_no_plan.log b/tools/testing/kunit/test_data/test_is_test_passed-no_tests_no_plan.log
new file mode 100644
index 000000000000..dd873c981108
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_is_test_passed-no_tests_no_plan.log
@@ -0,0 +1,7 @@
+TAP version 14
+1..1
+  # Subtest: suite
+  1..1
+    # Subtest: case
+  ok 1 - case # SKIP
+ok 1 - suite
-- 
cgit v1.2.3


From 142189f09cdf89e353e4a2dc558a7ad27b25f36d Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Thu, 7 Oct 2021 14:14:17 -0700
Subject: kunit: tool: print parsed test results fully incrementally

With the parser rework [1] and run_kernel() rework [2], this allows the
parser to print out test results incrementally.

Currently, that's held up by the fact that the LineStream eagerly
pre-fetches the next line when you call pop().
This blocks parse_test_result() from returning until the line *after*
the "ok 1 - test name" line is also printed.

One can see this with the following example:
$ (echo -e 'TAP version 14\n1..3\nok 1 - fake test'; sleep 2; echo -e 'ok 2 - fake test 2'; sleep 3; echo -e 'ok 3 - fake test 3') | ./tools/testing/kunit/kunit.py parse

Before this patch [1]: there's a pause before 'fake test' is printed.
After this patch: 'fake test' is printed out immediately.

This patch also adds
* a unit test to verify LineStream's behavior directly
* a test case to ensure that it's lazily calling the generator
* an explicit exception for when users go beyond EOF

[1] https://lore.kernel.org/linux-kselftest/20211006170049.106852-1-dlatypov@google.com/
[2] https://lore.kernel.org/linux-kselftest/20211005011340.2826268-1-dlatypov@google.com/

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_parser.py    | 22 +++++++++++++-----
 tools/testing/kunit/kunit_tool_test.py | 42 +++++++++++++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 68c847e8ca58..8e42b6ef3fe3 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -168,42 +168,51 @@ class TestCounts:
 class LineStream:
 	"""
 	A class to represent the lines of kernel output.
-	Provides a peek()/pop() interface over an iterator of
+	Provides a lazy peek()/pop() interface over an iterator of
 	(line#, text).
 	"""
 	_lines: Iterator[Tuple[int, str]]
 	_next: Tuple[int, str]
+	_need_next: bool
 	_done: bool
 
 	def __init__(self, lines: Iterator[Tuple[int, str]]):
 		"""Creates a new LineStream that wraps the given iterator."""
 		self._lines = lines
 		self._done = False
+		self._need_next = True
 		self._next = (0, '')
-		self._get_next()
 
 	def _get_next(self) -> None:
-		"""Advances the LineSteam to the next line."""
+		"""Advances the LineSteam to the next line, if necessary."""
+		if not self._need_next:
+			return
 		try:
 			self._next = next(self._lines)
 		except StopIteration:
 			self._done = True
+		finally:
+			self._need_next = False
 
 	def peek(self) -> str:
 		"""Returns the current line, without advancing the LineStream.
 		"""
+		self._get_next()
 		return self._next[1]
 
 	def pop(self) -> str:
 		"""Returns the current line and advances the LineStream to
 		the next line.
 		"""
-		n = self._next
-		self._get_next()
-		return n[1]
+		s = self.peek()
+		if self._done:
+			raise ValueError(f'LineStream: going past EOF, last line was {s}')
+		self._need_next = True
+		return s
 
 	def __bool__(self) -> bool:
 		"""Returns True if stream has more lines."""
+		self._get_next()
 		return not self._done
 
 	# Only used by kunit_tool_test.py.
@@ -216,6 +225,7 @@ class LineStream:
 
 	def line_number(self) -> int:
 		"""Returns the line number of the current line."""
+		self._get_next()
 		return self._next[0]
 
 # Parsing helper methods:
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 1b93f1151fcf..2d4d9d2713f9 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -13,9 +13,10 @@ import tempfile, shutil # Handling test_tmpdir
 
 import itertools
 import json
+import os
 import signal
 import subprocess
-import os
+from typing import Iterable
 
 import kunit_config
 import kunit_parser
@@ -331,6 +332,45 @@ class KUnitParserTest(unittest.TestCase):
 				result.status)
 			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
 
+def line_stream_from_strs(strs: Iterable[str]) -> kunit_parser.LineStream:
+	return kunit_parser.LineStream(enumerate(strs, start=1))
+
+class LineStreamTest(unittest.TestCase):
+
+	def test_basic(self):
+		stream = line_stream_from_strs(['hello', 'world'])
+
+		self.assertTrue(stream, msg='Should be more input')
+		self.assertEqual(stream.line_number(), 1)
+		self.assertEqual(stream.peek(), 'hello')
+		self.assertEqual(stream.pop(), 'hello')
+
+		self.assertTrue(stream, msg='Should be more input')
+		self.assertEqual(stream.line_number(), 2)
+		self.assertEqual(stream.peek(), 'world')
+		self.assertEqual(stream.pop(), 'world')
+
+		self.assertFalse(stream, msg='Should be no more input')
+		with self.assertRaisesRegex(ValueError, 'LineStream: going past EOF'):
+			stream.pop()
+
+	def test_is_lazy(self):
+		called_times = 0
+		def generator():
+			nonlocal called_times
+			for i in range(1,5):
+				called_times += 1
+				yield called_times, str(called_times)
+
+		stream = kunit_parser.LineStream(generator())
+		self.assertEqual(called_times, 0)
+
+		self.assertEqual(stream.pop(), '1')
+		self.assertEqual(called_times, 1)
+
+		self.assertEqual(stream.pop(), '2')
+		self.assertEqual(called_times, 2)
+
 class LinuxSourceTreeTest(unittest.TestCase):
 
 	def setUp(self):
-- 
cgit v1.2.3


From 98978490ccf7071442a5cbfaaa1a957a89b0c98b Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Fri, 5 Nov 2021 18:30:57 -0700
Subject: kunit: tool: move Kconfig read_from_file/parse_from_string to
 package-level

read_from_file() clears its `self` Kconfig object and parses a config
file.

It is a way to construct Kconfig objects more so than an operation on
Kconfig objects. This is reflected in the fact its only ever used as:
  kconfig = kunit_config.Kconfig()
  kconfig.read_from_file(path)

So clean this up and simplify callers by replacing it with
  kconfig = kunit_config.parse_file(path)

Do the same thing for the related parse_from_string() function as well.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_config.py    | 61 +++++++++++++++++-----------------
 tools/testing/kunit/kunit_kernel.py    | 12 +++----
 tools/testing/kunit/kunit_tool_test.py |  6 ++--
 3 files changed, 37 insertions(+), 42 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py
index c77c7d2ef622..677354546156 100644
--- a/tools/testing/kunit/kunit_config.py
+++ b/tools/testing/kunit/kunit_config.py
@@ -62,33 +62,34 @@ class Kconfig(object):
 			for entry in self.entries():
 				f.write(str(entry) + '\n')
 
-	def parse_from_string(self, blob: str) -> None:
-		"""Parses a string containing KconfigEntrys and populates this Kconfig."""
-		self._entries = []
-		is_not_set_matcher = re.compile(CONFIG_IS_NOT_SET_PATTERN)
-		config_matcher = re.compile(CONFIG_PATTERN)
-		for line in blob.split('\n'):
-			line = line.strip()
-			if not line:
-				continue
-
-			match = config_matcher.match(line)
-			if match:
-				entry = KconfigEntry(match.group(1), match.group(2))
-				self.add_entry(entry)
-				continue
-
-			empty_match = is_not_set_matcher.match(line)
-			if empty_match:
-				entry = KconfigEntry(empty_match.group(1), 'n')
-				self.add_entry(entry)
-				continue
-
-			if line[0] == '#':
-				continue
-			else:
-				raise KconfigParseError('Failed to parse: ' + line)
-
-	def read_from_file(self, path: str) -> None:
-		with open(path, 'r') as f:
-			self.parse_from_string(f.read())
+def parse_file(path: str) -> Kconfig:
+	with open(path, 'r') as f:
+		return parse_from_string(f.read())
+
+def parse_from_string(blob: str) -> Kconfig:
+	"""Parses a string containing Kconfig entries."""
+	kconfig = Kconfig()
+	is_not_set_matcher = re.compile(CONFIG_IS_NOT_SET_PATTERN)
+	config_matcher = re.compile(CONFIG_PATTERN)
+	for line in blob.split('\n'):
+		line = line.strip()
+		if not line:
+			continue
+
+		match = config_matcher.match(line)
+		if match:
+			entry = KconfigEntry(match.group(1), match.group(2))
+			kconfig.add_entry(entry)
+			continue
+
+		empty_match = is_not_set_matcher.match(line)
+		if empty_match:
+			entry = KconfigEntry(empty_match.group(1), 'n')
+			kconfig.add_entry(entry)
+			continue
+
+		if line[0] == '#':
+			continue
+		else:
+			raise KconfigParseError('Failed to parse: ' + line)
+	return kconfig
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 66095568bf32..51ee6e5dae91 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -116,8 +116,7 @@ class LinuxSourceTreeOperationsQemu(LinuxSourceTreeOperations):
 		self._extra_qemu_params = qemu_arch_params.extra_qemu_params
 
 	def make_arch_qemuconfig(self, base_kunitconfig: kunit_config.Kconfig) -> None:
-		kconfig = kunit_config.Kconfig()
-		kconfig.parse_from_string(self._kconfig)
+		kconfig = kunit_config.parse_from_string(self._kconfig)
 		base_kunitconfig.merge_in_entries(kconfig)
 
 	def start(self, params: List[str], build_dir: str) -> subprocess.Popen:
@@ -249,8 +248,7 @@ class LinuxSourceTree(object):
 			if not os.path.exists(kunitconfig_path):
 				shutil.copyfile(DEFAULT_KUNITCONFIG_PATH, kunitconfig_path)
 
-		self._kconfig = kunit_config.Kconfig()
-		self._kconfig.read_from_file(kunitconfig_path)
+		self._kconfig = kunit_config.parse_file(kunitconfig_path)
 
 	def clean(self) -> bool:
 		try:
@@ -262,8 +260,7 @@ class LinuxSourceTree(object):
 
 	def validate_config(self, build_dir) -> bool:
 		kconfig_path = get_kconfig_path(build_dir)
-		validated_kconfig = kunit_config.Kconfig()
-		validated_kconfig.read_from_file(kconfig_path)
+		validated_kconfig = kunit_config.parse_file(kconfig_path)
 		if not self._kconfig.is_subset_of(validated_kconfig):
 			invalid = self._kconfig.entries() - validated_kconfig.entries()
 			message = 'Provided Kconfig is not contained in validated .config. Following fields found in kunitconfig, ' \
@@ -291,8 +288,7 @@ class LinuxSourceTree(object):
 		"""Creates a new .config if it is not a subset of the .kunitconfig."""
 		kconfig_path = get_kconfig_path(build_dir)
 		if os.path.exists(kconfig_path):
-			existing_kconfig = kunit_config.Kconfig()
-			existing_kconfig.read_from_file(kconfig_path)
+			existing_kconfig = kunit_config.parse_file(kconfig_path)
 			self._ops.make_arch_qemuconfig(self._kconfig)
 			if not self._kconfig.is_subset_of(existing_kconfig):
 				print('Regenerating .config ...')
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 2d4d9d2713f9..d125d2380b38 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -51,10 +51,9 @@ class KconfigTest(unittest.TestCase):
 		self.assertFalse(kconfig1.is_subset_of(kconfig0))
 
 	def test_read_from_file(self):
-		kconfig = kunit_config.Kconfig()
 		kconfig_path = test_data_path('test_read_from_file.kconfig')
 
-		kconfig.read_from_file(kconfig_path)
+		kconfig = kunit_config.parse_file(kconfig_path)
 
 		expected_kconfig = kunit_config.Kconfig()
 		expected_kconfig.add_entry(
@@ -87,8 +86,7 @@ class KconfigTest(unittest.TestCase):
 
 		expected_kconfig.write_to_file(kconfig_path)
 
-		actual_kconfig = kunit_config.Kconfig()
-		actual_kconfig.read_from_file(kconfig_path)
+		actual_kconfig = kunit_config.parse_file(kconfig_path)
 
 		self.assertEqual(actual_kconfig.entries(),
 				 expected_kconfig.entries())
-- 
cgit v1.2.3


From 9f57cc76eccc1e0a369bb051c4b0d596e7d15e30 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Fri, 5 Nov 2021 18:30:58 -0700
Subject: kunit: tool: add --kconfig_add to allow easily tweaking kunitconfigs

E.g. run tests but with KASAN
$ ./tools/testing/kunit/kunit.py run --arch=x86_64 --kconfig_add=CONFIG_KASAN=y

This also works with --kunitconfig
$ ./tools/testing/kunit/kunit.py run --arch=x86_64 --kunitconfig=fs/ext4 --kconfig_add=CONFIG_KASAN=y

This flag is inspired by TuxMake's --kconfig-add, see
https://gitlab.com/Linaro/tuxmake#examples.

Our version just uses "_" as the delimiter for consistency with
pre-existing flags like --build_dir, --make_options, --kernel_args, etc.

Note: this does make it easier to run into a pre-existing edge case:
$ ./tools/testing/kunit/kunit.py run --arch=x86_64 --kconfig_add=CONFIG_KASAN=y
$ ./tools/testing/kunit/kunit.py run --arch=x86_64
This second invocation ^ still has KASAN enabled!

kunit.py won't call olddefconfig if our current .config is already a
superset of the provided kunitconfig.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           |  8 ++++++++
 tools/testing/kunit/kunit_kernel.py    |  5 +++++
 tools/testing/kunit/kunit_tool_test.py | 18 ++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 68e6f461c758..be58f4c93806 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -280,6 +280,10 @@ def add_common_opts(parser) -> None:
 			     ' If given a directory, (e.g. lib/kunit), "/.kunitconfig" '
 			     'will get  automatically appended.',
 			     metavar='kunitconfig')
+	parser.add_argument('--kconfig_add',
+			     help='Additional Kconfig options to append to the '
+			     '.kunitconfig, e.g. CONFIG_KASAN=y. Can be repeated.',
+			    action='append')
 
 	parser.add_argument('--arch',
 			    help=('Specifies the architecture to run tests under. '
@@ -398,6 +402,7 @@ def main(argv, linux=None):
 		if not linux:
 			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
 					kunitconfig_path=cli_args.kunitconfig,
+					kconfig_add=cli_args.kconfig_add,
 					arch=cli_args.arch,
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
@@ -423,6 +428,7 @@ def main(argv, linux=None):
 		if not linux:
 			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
 					kunitconfig_path=cli_args.kunitconfig,
+					kconfig_add=cli_args.kconfig_add,
 					arch=cli_args.arch,
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
@@ -439,6 +445,7 @@ def main(argv, linux=None):
 		if not linux:
 			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
 					kunitconfig_path=cli_args.kunitconfig,
+					kconfig_add=cli_args.kconfig_add,
 					arch=cli_args.arch,
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
@@ -457,6 +464,7 @@ def main(argv, linux=None):
 		if not linux:
 			linux = kunit_kernel.LinuxSourceTree(cli_args.build_dir,
 					kunitconfig_path=cli_args.kunitconfig,
+					kconfig_add=cli_args.kconfig_add,
 					arch=cli_args.arch,
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 51ee6e5dae91..7d459d6d6ff2 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -224,6 +224,7 @@ class LinuxSourceTree(object):
 	      build_dir: str,
 	      load_config=True,
 	      kunitconfig_path='',
+	      kconfig_add: Optional[List[str]]=None,
 	      arch=None,
 	      cross_compile=None,
 	      qemu_config_path=None) -> None:
@@ -249,6 +250,10 @@ class LinuxSourceTree(object):
 				shutil.copyfile(DEFAULT_KUNITCONFIG_PATH, kunitconfig_path)
 
 		self._kconfig = kunit_config.parse_file(kunitconfig_path)
+		if kconfig_add:
+			kconfig = kunit_config.parse_from_string('\n'.join(kconfig_add))
+			self._kconfig.merge_in_entries(kconfig)
+
 
 	def clean(self) -> bool:
 		try:
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index d125d2380b38..c1982ace312b 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -389,6 +389,10 @@ class LinuxSourceTreeTest(unittest.TestCase):
 				pass
 			kunit_kernel.LinuxSourceTree('', kunitconfig_path=dir)
 
+	def test_kconfig_add(self):
+		tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y'])
+		self.assertIn(kunit_config.KconfigEntry('NOT_REAL', 'y'), tree._kconfig.entries())
+
 	def test_invalid_arch(self):
 		with self.assertRaisesRegex(kunit_kernel.ConfigError, 'not a valid arch, options are.*x86_64'):
 			kunit_kernel.LinuxSourceTree('', arch='invalid')
@@ -601,6 +605,7 @@ class KUnitMainTest(unittest.TestCase):
 		# Just verify that we parsed and initialized it correctly here.
 		mock_linux_init.assert_called_once_with('.kunit',
 							kunitconfig_path='mykunitconfig',
+							kconfig_add=None,
 							arch='um',
 							cross_compile=None,
 							qemu_config_path=None)
@@ -612,6 +617,19 @@ class KUnitMainTest(unittest.TestCase):
 		# Just verify that we parsed and initialized it correctly here.
 		mock_linux_init.assert_called_once_with('.kunit',
 							kunitconfig_path='mykunitconfig',
+							kconfig_add=None,
+							arch='um',
+							cross_compile=None,
+							qemu_config_path=None)
+
+	@mock.patch.object(kunit_kernel, 'LinuxSourceTree')
+	def test_run_kconfig_add(self, mock_linux_init):
+		mock_linux_init.return_value = self.linux_source_mock
+		kunit.main(['run', '--kconfig_add=CONFIG_KASAN=y', '--kconfig_add=CONFIG_KCSAN=y'])
+		# Just verify that we parsed and initialized it correctly here.
+		mock_linux_init.assert_called_once_with('.kunit',
+							kunitconfig_path=None,
+							kconfig_add=['CONFIG_KASAN=y', 'CONFIG_KCSAN=y'],
 							arch='um',
 							cross_compile=None,
 							qemu_config_path=None)
-- 
cgit v1.2.3


From c44895b6cd8517c022dcc3d6fd4cd9bea1184a06 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Thu, 11 Nov 2021 10:33:56 -0800
Subject: kunit: tool: revamp message for invalid kunitconfig

The current error message is precise, but not very clear if you don't
already know what it's talking about, e.g.

> $ make ARCH=um olddefconfig O=.kunit
> ERROR:root:Provided Kconfig is not contained in validated .config. Following fields found in kunitconfig, but not in .config: CONFIG_DRM=y

Try to reword the error message so that it's
* your missing options usually have unsatisified dependencies
* if you're on UML, that might be the cause (it is, in this example)

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_kernel.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 7d459d6d6ff2..350883672be0 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -266,15 +266,17 @@ class LinuxSourceTree(object):
 	def validate_config(self, build_dir) -> bool:
 		kconfig_path = get_kconfig_path(build_dir)
 		validated_kconfig = kunit_config.parse_file(kconfig_path)
-		if not self._kconfig.is_subset_of(validated_kconfig):
-			invalid = self._kconfig.entries() - validated_kconfig.entries()
-			message = 'Provided Kconfig is not contained in validated .config. Following fields found in kunitconfig, ' \
-					  'but not in .config: %s' % (
-					', '.join([str(e) for e in invalid])
-			)
-			logging.error(message)
-			return False
-		return True
+		if self._kconfig.is_subset_of(validated_kconfig):
+			return True
+		invalid = self._kconfig.entries() - validated_kconfig.entries()
+		message = 'Not all Kconfig options selected in kunitconfig were in the generated .config.\n' \
+			  'This is probably due to unsatisfied dependencies.\n' \
+			  'Missing: ' + ', '.join([str(e) for e in invalid])
+		if self._arch == 'um':
+			message += '\nNote: many Kconfig options aren\'t available on UML. You can try running ' \
+				   'on a different architecture with something like "--arch=x86_64".'
+		logging.error(message)
+		return False
 
 	def build_config(self, build_dir, make_options) -> bool:
 		kconfig_path = get_kconfig_path(build_dir)
-- 
cgit v1.2.3


From 4c2911f1e140b393d2c9920875911c992f8dda2c Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Fri, 19 Nov 2021 15:23:16 -0800
Subject: kunit: tool: reconfigure when the used kunitconfig changes

Problem: currently, if you remove something from your kunitconfig,
kunit.py will not regenerate the .config file.
The same thing happens if you did --kunitconfig_add=CONFIG_KASAN=y [1]
and then ran again without it. Your new run will still have KASAN.

The reason is that kunit.py won't regenerate the .config file if it's a
superset of the kunitconfig. This speeds it up a bit for iterating.

This patch adds an additional check that forces kunit.py to regenerate
the .config file if the current kunitconfig doesn't match the previous
one.

What this means:
* deleting entries from .kunitconfig works as one would expect
* dropping  a --kunitconfig_add also triggers a rebuild
* you can still edit .config directly to turn on new options

We implement this by creating a `last_used_kunitconfig` file in the
build directory (so .kunit, by default) after we generate the .config.
When comparing the kconfigs, we compare python sets, so duplicates and
permutations don't trip us up.

The majority of this patch is adding unit tests for the existing logic
and for the new case where `last_used_kunitconfig` differs.

[1] https://lore.kernel.org/linux-kselftest/20211106013058.2621799-2-dlatypov@google.com/

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/dev-tools/kunit/start.rst |  8 +++---
 tools/testing/kunit/kunit_kernel.py     | 40 +++++++++++++++++++++--------
 tools/testing/kunit/kunit_tool_test.py  | 45 +++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst
index 1e00f9226f74..0a5e65540974 100644
--- a/Documentation/dev-tools/kunit/start.rst
+++ b/Documentation/dev-tools/kunit/start.rst
@@ -50,10 +50,10 @@ It'll warn you if you haven't included the dependencies of the options you're
 using.
 
 .. note::
-   Note that removing something from the ``.kunitconfig`` will not trigger a
-   rebuild of the ``.config`` file: the configuration is only updated if the
-   ``.kunitconfig`` is not a subset of ``.config``. This means that you can use
-   other tools (such as make menuconfig) to adjust other config options.
+   If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
+   ``.config`` file. But you can edit the ``.config`` file directly or with
+   tools like ``make menuconfig O=.kunit``. As long as its a superset of
+   ``.kunitconfig``, kunit.py won't overwrite your changes.
 
 
 Running the tests (KUnit Wrapper)
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 350883672be0..12085e04a80c 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -21,6 +21,7 @@ import qemu_config
 
 KCONFIG_PATH = '.config'
 KUNITCONFIG_PATH = '.kunitconfig'
+OLD_KUNITCONFIG_PATH = 'last_used_kunitconfig'
 DEFAULT_KUNITCONFIG_PATH = 'tools/testing/kunit/configs/default.config'
 BROKEN_ALLCONFIG_PATH = 'tools/testing/kunit/configs/broken_on_uml.config'
 OUTFILE_PATH = 'test.log'
@@ -179,6 +180,9 @@ def get_kconfig_path(build_dir) -> str:
 def get_kunitconfig_path(build_dir) -> str:
 	return get_file_path(build_dir, KUNITCONFIG_PATH)
 
+def get_old_kunitconfig_path(build_dir) -> str:
+	return get_file_path(build_dir, OLD_KUNITCONFIG_PATH)
+
 def get_outfile_path(build_dir) -> str:
 	return get_file_path(build_dir, OUTFILE_PATH)
 
@@ -289,24 +293,38 @@ class LinuxSourceTree(object):
 		except ConfigError as e:
 			logging.error(e)
 			return False
-		return self.validate_config(build_dir)
+		if not self.validate_config(build_dir):
+			return False
+
+		old_path = get_old_kunitconfig_path(build_dir)
+		if os.path.exists(old_path):
+			os.remove(old_path)  # write_to_file appends to the file
+		self._kconfig.write_to_file(old_path)
+		return True
+
+	def _kunitconfig_changed(self, build_dir: str) -> bool:
+		old_path = get_old_kunitconfig_path(build_dir)
+		if not os.path.exists(old_path):
+			return True
+
+		old_kconfig = kunit_config.parse_file(old_path)
+		return old_kconfig.entries() != self._kconfig.entries()
 
 	def build_reconfig(self, build_dir, make_options) -> bool:
 		"""Creates a new .config if it is not a subset of the .kunitconfig."""
 		kconfig_path = get_kconfig_path(build_dir)
-		if os.path.exists(kconfig_path):
-			existing_kconfig = kunit_config.parse_file(kconfig_path)
-			self._ops.make_arch_qemuconfig(self._kconfig)
-			if not self._kconfig.is_subset_of(existing_kconfig):
-				print('Regenerating .config ...')
-				os.remove(kconfig_path)
-				return self.build_config(build_dir, make_options)
-			else:
-				return True
-		else:
+		if not os.path.exists(kconfig_path):
 			print('Generating .config ...')
 			return self.build_config(build_dir, make_options)
 
+		existing_kconfig = kunit_config.parse_file(kconfig_path)
+		self._ops.make_arch_qemuconfig(self._kconfig)
+		if self._kconfig.is_subset_of(existing_kconfig) and not self._kunitconfig_changed(build_dir):
+			return True
+		print('Regenerating .config ...')
+		os.remove(kconfig_path)
+		return self.build_config(build_dir, make_options)
+
 	def build_kernel(self, alltests, jobs, build_dir, make_options) -> bool:
 		try:
 			if alltests:
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index c1982ace312b..97a4c72af2b8 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -413,6 +413,51 @@ class LinuxSourceTreeTest(unittest.TestCase):
 			with open(kunit_kernel.get_outfile_path(build_dir), 'rt') as outfile:
 				self.assertEqual(outfile.read(), 'hi\nbye\n', msg='Missing some output')
 
+	def test_build_reconfig_no_config(self):
+		with tempfile.TemporaryDirectory('') as build_dir:
+			with open(kunit_kernel.get_kunitconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y')
+
+			tree = kunit_kernel.LinuxSourceTree(build_dir)
+			mock_build_config = mock.patch.object(tree, 'build_config').start()
+
+			# Should generate the .config
+			self.assertTrue(tree.build_reconfig(build_dir, make_options=[]))
+			mock_build_config.assert_called_once_with(build_dir, [])
+
+	def test_build_reconfig_existing_config(self):
+		with tempfile.TemporaryDirectory('') as build_dir:
+			# Existing .config is a superset, should not touch it
+			with open(kunit_kernel.get_kunitconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y')
+			with open(kunit_kernel.get_old_kunitconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y')
+			with open(kunit_kernel.get_kconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y\nCONFIG_KUNIT_TEST=y')
+
+			tree = kunit_kernel.LinuxSourceTree(build_dir)
+			mock_build_config = mock.patch.object(tree, 'build_config').start()
+
+			self.assertTrue(tree.build_reconfig(build_dir, make_options=[]))
+			self.assertEqual(mock_build_config.call_count, 0)
+
+	def test_build_reconfig_remove_option(self):
+		with tempfile.TemporaryDirectory('') as build_dir:
+			# We removed CONFIG_KUNIT_TEST=y from our .kunitconfig...
+			with open(kunit_kernel.get_kunitconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y')
+			with open(kunit_kernel.get_old_kunitconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y\nCONFIG_KUNIT_TEST=y')
+			with open(kunit_kernel.get_kconfig_path(build_dir), 'w') as f:
+				f.write('CONFIG_KUNIT=y\nCONFIG_KUNIT_TEST=y')
+
+			tree = kunit_kernel.LinuxSourceTree(build_dir)
+			mock_build_config = mock.patch.object(tree, 'build_config').start()
+
+			# ... so we should trigger a call to build_config()
+			self.assertTrue(tree.build_reconfig(build_dir, make_options=[]))
+			mock_build_config.assert_called_once_with(build_dir, [])
+
 	# TODO: add more test cases.
 
 
-- 
cgit v1.2.3


From 7fa7ffcf9babaea2f0a81681b4ef460ee4b93278 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Fri, 19 Nov 2021 19:24:01 -0800
Subject: kunit: tool: suggest using decode_stacktrace.sh on kernel crash

kunit.py isn't very clear that
1) it stashes a copy of the unparsed output in $BUILD_DIR/test.log
2) it sets $BUILD_DIR=.kunit by default

So it's trickier than it should be for a user to come up with the right
command to do so.

Make kunit.py print out a command for this if
a) we saw a test case crash
b) we only ran one kernel (test.log only contains output from the last)

Example suggested command:
$ scripts/decode_stacktrace.sh .kunit/vmlinux .kunit < .kunit/test.log | tee .kunit/decoded.log | ./tools/testing/kunit/kunit.py parse

Without debug info a user might see something like
[14:11:25] Call Trace:
[14:11:25] ? kunit_binary_assert_format (:?)
[14:11:25] kunit_try_run_case (test.c:?)
[14:11:25] ? __kthread_parkme (kthread.c:?)
[14:11:25] kunit_generic_run_threadfn_adapter (try-catch.c:?)
[14:11:25] ? kunit_generic_run_threadfn_adapter (try-catch.c:?)
[14:11:25] kthread (kthread.c:?)
[14:11:25] new_thread_handler (:?)
[14:11:25] [CRASHED]

`tee` is in GNU coreutils, so it seems fine to add that into the
pipeline by default, that way users can inspect the otuput in more
detail.

Note: to turn on debug info, users would need to do something like
$ echo -e 'CONFIG_DEBUG_KERNEL=y\nCONFIG_DEBUG_INFO=y' >> .kunit/.kunitconfig
$ ./tools/testing/kunit/kunit.py config
$ ./tools/testing/kunit/kunit.py build
$ <then run decode_stacktrace.sh now vmlinux is updated>

This feels too clunky to include in the instructions.
With --kconfig_add [1], it would become a bit less painful.

[1] https://lore.kernel.org/linux-kselftest/20211106013058.2621799-2-dlatypov@google.com/

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index be58f4c93806..8c52faf5787c 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -156,6 +156,12 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest,
 
 		test_counts.add_subtest_counts(result.result.test.counts)
 
+	if len(filter_globs) == 1 and test_counts.crashed > 0:
+		bd = request.build_dir
+		print('The kernel seems to have crashed; you can decode the stack traces with:')
+		print('$ scripts/decode_stacktrace.sh {}/vmlinux {} < {} | tee {}/decoded.log | {} parse'.format(
+				bd, bd, kunit_kernel.get_outfile_path(bd), bd, sys.argv[0]))
+
 	kunit_status = _map_to_overall_status(test_counts.get_status())
 	return KunitResult(status=kunit_status, result=result.result, elapsed_time=exec_time)
 
-- 
cgit v1.2.3


From acd143eefb8270b8f50e2ce44e2fcdbbcf496a86 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Mon, 13 Dec 2021 18:30:59 +0000
Subject: selftests/bpf: Fix segfault in bpf_tcp_ca

Since commit ad9a7f96445b ("libbpf: Improve logging around BPF program
loading"), libbpf_debug_print() gets an additional prog_name parameter
but doesn't pass it to printf(). Since the format string now expects two
arguments, printf() may read uninitialized data and segfault. Pass
prog_name through.

Fixes: ad9a7f96445b ("libbpf: Improve logging around BPF program loading")
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211213183058.346066-1-jean-philippe@linaro.org
---
 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index 8daca0ac909f..8f7a1cef7d87 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -217,7 +217,7 @@ static bool found;
 static int libbpf_debug_print(enum libbpf_print_level level,
 			      const char *format, va_list args)
 {
-	const char *log_buf;
+	const char *prog_name, *log_buf;
 
 	if (level != LIBBPF_WARN ||
 	    !strstr(format, "-- BEGIN PROG LOAD LOG --")) {
@@ -225,15 +225,14 @@ static int libbpf_debug_print(enum libbpf_print_level level,
 		return 0;
 	}
 
-	/* skip prog_name */
-	va_arg(args, char *);
+	prog_name = va_arg(args, char *);
 	log_buf = va_arg(args, char *);
 	if (!log_buf)
 		goto out;
 	if (err_str && strstr(log_buf, err_str) != NULL)
 		found = true;
 out:
-	printf(format, log_buf);
+	printf(format, prog_name, log_buf);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f742fc68ac0da76d96e5713210b0aef771c1dd0f Mon Sep 17 00:00:00 2001
From: Grant Seltzer <grantseltzer@gmail.com>
Date: Thu, 9 Dec 2021 18:22:22 -0500
Subject: libbpf: Add doc comments for bpf_program__(un)pin()

This adds doc comments for the two bpf_program pinning functions,
bpf_program__pin() and bpf_program__unpin()

Signed-off-by: Grant Seltzer <grantseltzer@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211209232222.541733-1-grantseltzer@gmail.com
---
 tools/lib/bpf/libbpf.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a8b894dae633..3f5ff2c09dfe 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -339,7 +339,31 @@ LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated"
 LIBBPF_API int bpf_program__unpin_instance(struct bpf_program *prog,
 					   const char *path,
 					   int instance);
+
+/**
+ * @brief **bpf_program__pin()** pins the BPF program to a file
+ * in the BPF FS specified by a path. This increments the programs
+ * reference count, allowing it to stay loaded after the process
+ * which loaded it has exited.
+ *
+ * @param prog BPF program to pin, must already be loaded
+ * @param path file path in a BPF file system
+ * @return 0, on success; negative error code, otherwise
+ */
 LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path);
+
+/**
+ * @brief **bpf_program__unpin()** unpins the BPF program from a file
+ * in the BPFFS specified by a path. This decrements the programs
+ * reference count.
+ *
+ * The file pinning the BPF program can also be unlinked by a different
+ * process in which case this function will return an error.
+ *
+ * @param prog BPF program to unpin
+ * @param path file path to the pin in a BPF file system
+ * @return 0, on success; negative error code, otherwise
+ */
 LIBBPF_API int bpf_program__unpin(struct bpf_program *prog, const char *path);
 LIBBPF_API void bpf_program__unload(struct bpf_program *prog);
 
-- 
cgit v1.2.3


From 20617717cd219d3c1f798cd13dbce1bcd86a6ece Mon Sep 17 00:00:00 2001
From: Danielle Ratson <danieller@nvidia.com>
Date: Tue, 14 Dec 2021 12:21:37 +0200
Subject: selftests: mlxsw: Add a test case for MAC profiles consolidation

Add a test case to cover the bug fixed by the previous patch.

Edit the MAC address of one netdev so that it matches the MAC address of
the second netdev. Verify that the two MAC profiles were consolidated by
testing that the MAC profiles occupancy decreased by one.

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../drivers/net/mlxsw/rif_mac_profiles_occ.sh      | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/rif_mac_profiles_occ.sh b/tools/testing/selftests/drivers/net/mlxsw/rif_mac_profiles_occ.sh
index b513f64d9092..026a126f584d 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/rif_mac_profiles_occ.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/rif_mac_profiles_occ.sh
@@ -72,6 +72,35 @@ rif_mac_profile_replacement_test()
 	ip link set $h1.10 address $h1_10_mac
 }
 
+rif_mac_profile_consolidation_test()
+{
+	local count=$1; shift
+	local h1_20_mac
+
+	RET=0
+
+	if [[ $count -eq 1 ]]; then
+		return
+	fi
+
+	h1_20_mac=$(mac_get $h1.20)
+
+	# Set the MAC of $h1.20 to that of $h1.10 and confirm that they are
+	# using the same MAC profile.
+	ip link set $h1.20 address 00:11:11:11:11:11
+	check_err $?
+
+	occ=$(devlink -j resource show $DEVLINK_DEV \
+	      | jq '.[][][] | select(.name=="rif_mac_profiles") |.["occ"]')
+
+	[[ $occ -eq $((count - 1)) ]]
+	check_err $? "MAC profile occupancy did not decrease"
+
+	log_test "RIF MAC profile consolidation"
+
+	ip link set $h1.20 address $h1_20_mac
+}
+
 rif_mac_profile_shared_replacement_test()
 {
 	local count=$1; shift
@@ -104,6 +133,7 @@ rif_mac_profile_edit_test()
 	create_max_rif_mac_profiles $count
 
 	rif_mac_profile_replacement_test
+	rif_mac_profile_consolidation_test $count
 	rif_mac_profile_shared_replacement_test $count
 }
 
-- 
cgit v1.2.3


From f7abc4c8df8c7930d0b9c56d9abee9a1fca635e9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 14 Dec 2021 07:18:00 +0530
Subject: selftests/bpf: Fix OOB write in test_verifier

The commit referenced below added fixup_map_timer support (to create a
BPF map containing timers), but failed to increase the size of the
map_fds array, leading to out of bounds write. Fix this by changing
MAX_NR_MAPS to 22.

Fixes: e60e6962c503 ("selftests/bpf: Add tests for restricted helpers")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211214014800.78762-1-memxor@gmail.com
---
 tools/testing/selftests/bpf/test_verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 465ef3f112c0..d3bf83d5c6cf 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -54,7 +54,7 @@
 #define MAX_INSNS	BPF_MAXINSNS
 #define MAX_TEST_INSNS	1000000
 #define MAX_FIXUPS	8
-#define MAX_NR_MAPS	21
+#define MAX_NR_MAPS	22
 #define MAX_TEST_RUNS	8
 #define POINTER_VALUE	0xcafe4all
 #define TEST_DATA_LEN	64
-- 
cgit v1.2.3


From 4581e676d3be9d8c921a48bf18e92c5a8f31bf13 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 13 Dec 2021 17:00:32 -0800
Subject: libbpf: Fix potential uninit memory read

In case of BPF_CORE_TYPE_ID_LOCAL we fill out target result explicitly.
But targ_res itself isn't initialized in such a case, and subsequent
call to bpf_core_patch_insn() might read uninitialized field (like
fail_memsz_adjust in this case). So ensure that targ_res is
zero-initialized for BPF_CORE_TYPE_ID_LOCAL case.

This was reported by Coverity static analyzer.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211214010032.3843804-1-andrii@kernel.org
---
 tools/lib/bpf/relo_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index c770483b4c36..910865e29edc 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -1223,6 +1223,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn,
 	/* TYPE_ID_LOCAL relo is special and doesn't need candidate search */
 	if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) {
 		/* bpf_insn's imm value could get out of sync during linking */
+		memset(&targ_res, 0, sizeof(targ_res));
 		targ_res.validate = false;
 		targ_res.poison = false;
 		targ_res.orig_val = local_spec->root_type_id;
-- 
cgit v1.2.3


From 9fc205b413b3f3e9502fa92151fba63b91230454 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 10 Dec 2021 16:40:43 -0800
Subject: libbpf: Add sane strncpy alternative and use it internally

strncpy() has a notoriously error-prone semantics which makes GCC
complain about it a lot (and quite often completely completely falsely
at that). Instead of pleasing GCC all the time (-Wno-stringop-truncation
is unfortunately only supported by GCC, so it's a bit too messy to just
enable it in Makefile), add libbpf-internal libbpf_strlcpy() helper
which follows what FreeBSD's strlcpy() does and what most people would
expect from strncpy(): copies up to N-1 first bytes from source string
into destination string and ensures zero-termination afterwards.

Replace all the relevant uses of strncpy/strncat/memcpy in libbpf with
libbpf_strlcpy().

This also fixes the issue reported by Emmanuel Deloget in xsk.c where
memcpy() could access source string beyond its end.

Fixes: 2f6324a3937f8 (libbpf: Support shared umems between queues and devices)
Reported-by: Emmanuel Deloget <emmanuel.deloget@eho.link>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211211004043.2374068-1-andrii@kernel.org
---
 tools/lib/bpf/bpf.c             |  4 ++--
 tools/lib/bpf/btf_dump.c        |  4 ++--
 tools/lib/bpf/gen_loader.c      |  6 ++----
 tools/lib/bpf/libbpf.c          |  8 +++-----
 tools/lib/bpf/libbpf_internal.h | 19 +++++++++++++++++++
 tools/lib/bpf/xsk.c             |  9 +++------
 6 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6b2407e12060..1f84d706eb3e 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -112,7 +112,7 @@ int bpf_map_create(enum bpf_map_type map_type,
 
 	attr.map_type = map_type;
 	if (map_name)
-		strncat(attr.map_name, map_name, sizeof(attr.map_name) - 1);
+		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
 	attr.key_size = key_size;
 	attr.value_size = value_size;
 	attr.max_entries = max_entries;
@@ -271,7 +271,7 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
 	attr.kern_version = OPTS_GET(opts, kern_version, 0);
 
 	if (prog_name)
-		strncat(attr.prog_name, prog_name, sizeof(attr.prog_name) - 1);
+		libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
 	attr.license = ptr_to_u64(license);
 
 	if (insn_cnt > UINT_MAX)
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index f06a1d343c92..b9a3260c83cb 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -2321,8 +2321,8 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
 	if (!opts->indent_str)
 		d->typed_dump->indent_str[0] = '\t';
 	else
-		strncat(d->typed_dump->indent_str, opts->indent_str,
-			sizeof(d->typed_dump->indent_str) - 1);
+		libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str,
+			       sizeof(d->typed_dump->indent_str));
 
 	d->typed_dump->compact = OPTS_GET(opts, compact, false);
 	d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false);
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index c5859c21d3e9..8ecef1088ba2 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -463,8 +463,7 @@ void bpf_gen__map_create(struct bpf_gen *gen,
 	attr.map_flags = map_attr->map_flags;
 	attr.map_extra = map_attr->map_extra;
 	if (map_name)
-		memcpy(attr.map_name, map_name,
-		       min((unsigned)strlen(map_name), BPF_OBJ_NAME_LEN - 1));
+		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
 	attr.numa_node = map_attr->numa_node;
 	attr.map_ifindex = map_attr->map_ifindex;
 	attr.max_entries = max_entries;
@@ -970,8 +969,7 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 	core_relos = add_data(gen, gen->core_relos,
 			     attr.core_relo_cnt * attr.core_relo_rec_size);
 
-	memcpy(attr.prog_name, prog_name,
-	       min((unsigned)strlen(prog_name), BPF_OBJ_NAME_LEN - 1));
+	libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
 	prog_load_attr = add_data(gen, &attr, attr_size);
 
 	/* populate union bpf_attr with a pointer to license */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 902f1ad5b7e6..c6d84b01a59e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1201,12 +1201,10 @@ static struct bpf_object *bpf_object__new(const char *path,
 
 	strcpy(obj->path, path);
 	if (obj_name) {
-		strncpy(obj->name, obj_name, sizeof(obj->name) - 1);
-		obj->name[sizeof(obj->name) - 1] = 0;
+		libbpf_strlcpy(obj->name, obj_name, sizeof(obj->name));
 	} else {
 		/* Using basename() GNU version which doesn't modify arg. */
-		strncpy(obj->name, basename((void *)path),
-			sizeof(obj->name) - 1);
+		libbpf_strlcpy(obj->name, basename((void *)path), sizeof(obj->name));
 		end = strchr(obj->name, '.');
 		if (end)
 			*end = 0;
@@ -1358,7 +1356,7 @@ static int bpf_object__check_endianness(struct bpf_object *obj)
 static int
 bpf_object__init_license(struct bpf_object *obj, void *data, size_t size)
 {
-	memcpy(obj->license, data, min(size, sizeof(obj->license) - 1));
+	libbpf_strlcpy(obj->license, data, sizeof(obj->license));
 	pr_debug("license of %s is %s\n", obj->path, obj->license);
 	return 0;
 }
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 355c41019aed..5e8166a2f3d8 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -169,6 +169,25 @@ static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size)
 	return realloc(ptr, total);
 }
 
+/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst
+ * is zero-terminated string no matter what (unless sz == 0, in which case
+ * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs
+ * in what is returned. Given this is internal helper, it's trivial to extend
+ * this, when necessary. Use this instead of strncpy inside libbpf source code.
+ */
+static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz)
+{
+	size_t i;
+
+	if (sz == 0)
+		return;
+
+	sz--;
+	for (i = 0; i < sz && src[i]; i++)
+		dst[i] = src[i];
+	dst[i] = '\0';
+}
+
 struct btf;
 struct btf_type;
 
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index e8d94c6dd3bc..edafe56664f3 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -548,8 +548,7 @@ static int xsk_get_max_queues(struct xsk_socket *xsk)
 		return -errno;
 
 	ifr.ifr_data = (void *)&channels;
-	memcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ - 1);
-	ifr.ifr_name[IFNAMSIZ - 1] = '\0';
+	libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
 	err = ioctl(fd, SIOCETHTOOL, &ifr);
 	if (err && errno != EOPNOTSUPP) {
 		ret = -errno;
@@ -768,8 +767,7 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
 	}
 
 	ctx->ifindex = ifindex;
-	memcpy(ctx->ifname, ifname, IFNAMSIZ -1);
-	ctx->ifname[IFNAMSIZ - 1] = 0;
+	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
 
 	xsk->ctx = ctx;
 	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
@@ -951,8 +949,7 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
 	ctx->refcount = 1;
 	ctx->umem = umem;
 	ctx->queue_id = queue_id;
-	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
-	ctx->ifname[IFNAMSIZ - 1] = '\0';
+	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
 
 	ctx->fill = fill;
 	ctx->comp = comp;
-- 
cgit v1.2.3


From 18edbb6b3259332b577c2b92b6b4d8dfd86ff00e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:00 +0000
Subject: kselftest/arm64: Parameterise ptrace vector length information

SME introduces a new mode called streaming mode in which the SVE registers
have a different vector length. Since the ptrace interface for this is
based on the existing SVE interface prepare for supporting this by moving
the regset specific configuration into  struct and passing that around,
allowing these tests to be reused for streaming mode. As we will also have
to verify the interoperation of the SVE and streaming SVE regsets don't
just iterate over an array.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-5-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 219 +++++++++++++++++---------
 1 file changed, 142 insertions(+), 77 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index c4417bc48d4f..af798b9d232c 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -21,16 +21,37 @@
 
 #include "../../kselftest.h"
 
-#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
-#define FPSIMD_TESTS 5
-
-#define EXPECTED_TESTS (VL_TESTS + FPSIMD_TESTS)
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
 
 /* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
 #ifndef NT_ARM_SVE
 #define NT_ARM_SVE 0x405
 #endif
 
+struct vec_type {
+	const char *name;
+	unsigned long hwcap_type;
+	unsigned long hwcap;
+	int regset;
+	int prctl_set;
+};
+
+static const struct vec_type vec_types[] = {
+	{
+		.name = "SVE",
+		.hwcap_type = AT_HWCAP,
+		.hwcap = HWCAP_SVE,
+		.regset = NT_ARM_SVE,
+		.prctl_set = PR_SVE_SET_VL,
+	},
+};
+
+#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
+#define FLAG_TESTS 2
+#define FPSIMD_TESTS 3
+
+#define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types))
+
 static void fill_buf(char *buf, size_t size)
 {
 	int i;
@@ -59,7 +80,8 @@ static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd)
 	return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov);
 }
 
-static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
+static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
+				       void **buf, size_t *size)
 {
 	struct user_sve_header *sve;
 	void *p;
@@ -80,7 +102,7 @@ static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
 
 		iov.iov_base = *buf;
 		iov.iov_len = sz;
-		if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov))
+		if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov))
 			goto error;
 
 		sve = *buf;
@@ -96,17 +118,18 @@ error:
 	return NULL;
 }
 
-static int set_sve(pid_t pid, const struct user_sve_header *sve)
+static int set_sve(pid_t pid, const struct vec_type *type,
+		   const struct user_sve_header *sve)
 {
 	struct iovec iov;
 
 	iov.iov_base = (void *)sve;
 	iov.iov_len = sve->size;
-	return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
+	return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov);
 }
 
 /* Validate setting and getting the inherit flag */
-static void ptrace_set_get_inherit(pid_t child)
+static void ptrace_set_get_inherit(pid_t child, const struct vec_type *type)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -118,9 +141,10 @@ static void ptrace_set_get_inherit(pid_t child)
 	sve.size = sizeof(sve);
 	sve.vl = sve_vl_from_vq(SVE_VQ_MIN);
 	sve.flags = SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to set %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
@@ -128,35 +152,39 @@ static void ptrace_set_get_inherit(pid_t child)
 	 * Read back the new register state and verify that we have
 	 * set the flags we expected.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(new_sve->flags & SVE_PT_VL_INHERIT,
-			 "SVE_PT_VL_INHERIT set\n");
+			 "%s SVE_PT_VL_INHERIT set\n", type->name);
 
 	/* Now clear */
 	sve.flags &= ~SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to clear SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to clear %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(!(new_sve->flags & SVE_PT_VL_INHERIT),
-			 "SVE_PT_VL_INHERIT cleared\n");
+			 "%s SVE_PT_VL_INHERIT cleared\n", type->name);
 
 	free(new_sve);
 }
 
 /* Validate attempting to set the specfied VL via ptrace */
-static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
+static void ptrace_set_get_vl(pid_t child, const struct vec_type *type,
+			      unsigned int vl, bool *supported)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -166,10 +194,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	*supported = false;
 
 	/* Check if the VL is supported in this process */
-	prctl_vl = prctl(PR_SVE_SET_VL, vl);
+	prctl_vl = prctl(type->prctl_set, vl);
 	if (prctl_vl == -1)
-		ksft_exit_fail_msg("prctl(PR_SVE_SET_VL) failed: %s (%d)\n",
-				   strerror(errno), errno);
+		ksft_exit_fail_msg("prctl(PR_%s_SET_VL) failed: %s (%d)\n",
+				   type->name, strerror(errno), errno);
 
 	/* If the VL is not supported then a supported VL will be returned */
 	*supported = (prctl_vl == vl);
@@ -178,9 +206,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	memset(&sve, 0, sizeof(sve));
 	sve.size = sizeof(sve);
 	sve.vl = vl;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
@@ -188,12 +217,14 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	 * Read back the new register state and verify that we have the
 	 * same VL that we got from prctl() on ourselves.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u\n", vl);
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
-	ksft_test_result(new_sve->vl = prctl_vl, "Set VL %u\n", vl);
+	ksft_test_result(new_sve->vl = prctl_vl, "Set %s VL %u\n",
+			 type->name, vl);
 
 	free(new_sve);
 }
@@ -209,7 +240,7 @@ static void check_u32(unsigned int vl, const char *reg,
 }
 
 /* Access the FPSIMD registers via the SVE regset */
-static void ptrace_sve_fpsimd(pid_t child)
+static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type)
 {
 	void *svebuf = NULL;
 	size_t svebufsz = 0;
@@ -219,17 +250,18 @@ static void ptrace_sve_fpsimd(pid_t child)
 	unsigned char *p;
 
 	/* New process should start with FPSIMD registers only */
-	sve = get_sve(child, &svebuf, &svebufsz);
+	sve = get_sve(child, type, &svebuf, &svebufsz);
 	if (!sve) {
-		ksft_test_result_fail("get_sve: %s\n", strerror(errno));
+		ksft_test_result_fail("get_sve(%s): %s\n",
+				      type->name, strerror(errno));
 
 		return;
 	} else {
-		ksft_test_result_pass("get_sve(FPSIMD)\n");
+		ksft_test_result_pass("get_sve(%s FPSIMD)\n", type->name);
 	}
 
 	ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD,
-			 "Set FPSIMD registers\n");
+			 "Set FPSIMD registers via %s\n", type->name);
 	if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD)
 		goto out;
 
@@ -243,9 +275,9 @@ static void ptrace_sve_fpsimd(pid_t child)
 			p[j] = j;
 	}
 
-	if (set_sve(child, sve)) {
-		ksft_test_result_fail("set_sve(FPSIMD): %s\n",
-				      strerror(errno));
+	if (set_sve(child, type, sve)) {
+		ksft_test_result_fail("set_sve(%s FPSIMD): %s\n",
+				      type->name, strerror(errno));
 
 		goto out;
 	}
@@ -257,16 +289,20 @@ static void ptrace_sve_fpsimd(pid_t child)
 		goto out;
 	}
 	if (memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0)
-		ksft_test_result_pass("get_fpsimd() gave same state\n");
+		ksft_test_result_pass("%s get_fpsimd() gave same state\n",
+				      type->name);
 	else
-		ksft_test_result_fail("get_fpsimd() gave different state\n");
+		ksft_test_result_fail("%s get_fpsimd() gave different state\n",
+				      type->name);
 
 out:
 	free(svebuf);
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_sve_data(pid_t child,
+					const struct vec_type *type,
+					unsigned int vl)
 {
 	void *write_buf;
 	void *read_buf = NULL;
@@ -281,8 +317,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -306,23 +342,26 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 
 	/* TODO: Generate a valid FFR pattern */
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
-	if (!get_sve(child, (void **)&read_buf, &read_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u data\n", vl);
+	if (!get_sve(child, type, (void **)&read_buf, &read_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 	read_sve = read_buf;
 
 	/* We might read more data if there's extensions we don't know */
 	if (read_sve->size < write_sve->size) {
-		ksft_test_result_fail("Wrote %d bytes, only read %d\n",
-				      write_sve->size, read_sve->size);
+		ksft_test_result_fail("%s wrote %d bytes, only read %d\n",
+				      type->name, write_sve->size,
+				      read_sve->size);
 		goto out_read;
 	}
 
@@ -349,7 +388,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  read_buf + SVE_PT_SVE_FPCR_OFFSET(vq), &errors);
 
-	ksft_test_result(errors == 0, "Set and get SVE data for VL %u\n", vl);
+	ksft_test_result(errors == 0, "Set and get %s data for VL %u\n",
+			 type->name, vl);
 
 out_read:
 	free(read_buf);
@@ -358,7 +398,9 @@ out:
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_fpsimd_data(pid_t child,
+					   const struct vec_type *type,
+					   unsigned int vl)
 {
 	void *write_buf;
 	struct user_sve_header *write_sve;
@@ -376,8 +418,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -395,16 +437,17 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	fill_buf(write_buf + SVE_PT_SVE_FPSR_OFFSET(vq), SVE_PT_SVE_FPSR_SIZE);
 	fill_buf(write_buf + SVE_PT_SVE_FPCR_OFFSET(vq), SVE_PT_SVE_FPCR_SIZE);
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
 	if (get_fpsimd(child, &fpsimd_state)) {
-		ksft_test_result_fail("Failed to read VL %u FPSIMD data\n",
-				      vl);
+		ksft_test_result_fail("Failed to read %s VL %u FPSIMD data\n",
+				      type->name, vl);
 		goto out;
 	}
 
@@ -419,7 +462,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 		       sizeof(tmp));
 
 		if (tmp != fpsimd_state.vregs[i]) {
-			printf("# Mismatch in FPSIMD for VL %u Z%d\n", vl, i);
+			printf("# Mismatch in FPSIMD for %s VL %u Z%d\n",
+			       type->name, vl, i);
 			errors++;
 		}
 	}
@@ -429,8 +473,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  &fpsimd_state.fpcr, &errors);
 
-	ksft_test_result(errors == 0, "Set and get FPSIMD data for VL %u\n",
-			 vl);
+	ksft_test_result(errors == 0, "Set and get FPSIMD data for %s VL %u\n",
+			 type->name, vl);
 
 out:
 	free(write_buf);
@@ -440,7 +484,7 @@ static int do_parent(pid_t child)
 {
 	int ret = EXIT_FAILURE;
 	pid_t pid;
-	int status;
+	int status, i;
 	siginfo_t si;
 	unsigned int vq, vl;
 	bool vl_supported;
@@ -499,26 +543,47 @@ static int do_parent(pid_t child)
 		}
 	}
 
-	/* FPSIMD via SVE regset */
-	ptrace_sve_fpsimd(child);
-
-	/* prctl() flags */
-	ptrace_set_get_inherit(child);
-
-	/* Step through every possible VQ */
-	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
-		vl = sve_vl_from_vq(vq);
+	for (i = 0; i < ARRAY_SIZE(vec_types); i++) {
+		/* FPSIMD via SVE regset */
+		if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
+			ptrace_sve_fpsimd(child, &vec_types[i]);
+		} else {
+			ksft_test_result_skip("%s FPSIMD get via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s FPSIMD set via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s set read via FPSIMD\n",
+					      vec_types[i].name);
+		}
 
-		/* First, try to set this vector length */
-		ptrace_set_get_vl(child, vl, &vl_supported);
+		/* prctl() flags */
+		ptrace_set_get_inherit(child, &vec_types[i]);
+
+		/* Step through every possible VQ */
+		for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
+			vl = sve_vl_from_vq(vq);
+
+			/* First, try to set this vector length */
+			if (getauxval(vec_types[i].hwcap_type) &
+			    vec_types[i].hwcap) {
+				ptrace_set_get_vl(child, &vec_types[i], vl,
+						  &vl_supported);
+			} else {
+				ksft_test_result_skip("%s get/set VL %d\n",
+						      vec_types[i].name, vl);
+				vl_supported = false;
+			}
 
-		/* If the VL is supported validate data set/get */
-		if (vl_supported) {
-			ptrace_set_sve_get_sve_data(child, vl);
-			ptrace_set_sve_get_fpsimd_data(child, vl);
-		} else {
-			ksft_test_result_skip("set SVE get SVE for VL %d\n", vl);
-			ksft_test_result_skip("set SVE get FPSIMD for VL %d\n", vl);
+			/* If the VL is supported validate data set/get */
+			if (vl_supported) {
+				ptrace_set_sve_get_sve_data(child, &vec_types[i], vl);
+				ptrace_set_sve_get_fpsimd_data(child, &vec_types[i], vl);
+			} else {
+				ksft_test_result_skip("%s set SVE get SVE for VL %d\n",
+						      vec_types[i].name, vl);
+				ksft_test_result_skip("%s set SVE get FPSIMD for VL %d\n",
+						      vec_types[i].name, vl);
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From 9331a604858a0f399ef8f886a88dd94927f136aa Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:01 +0000
Subject: kselftest/arm64: Allow signal tests to trigger from a function

Currently we have the facility to specify custom code to trigger a signal
but none of the tests use it and for some reason the framework requires us
to also specify a signal to send as a trigger in order to make use of a
custom trigger. This doesn't seem to make much sense, instead allow the
use of a custom trigger function without specifying a signal to inject.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-6-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/test_signals_utils.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 22722abc9dfa..2f8c23af3b5e 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -310,14 +310,12 @@ int test_setup(struct tdescr *td)
 
 int test_run(struct tdescr *td)
 {
-	if (td->sig_trig) {
-		if (td->trigger)
-			return td->trigger(td);
-		else
-			return default_trigger(td);
-	} else {
+	if (td->trigger)
+		return td->trigger(td);
+	else if (td->sig_trig)
+		return default_trigger(td);
+	else
 		return td->run(td, NULL, NULL);
-	}
 }
 
 void test_result(struct tdescr *td)
-- 
cgit v1.2.3


From b77e995e3b9638e759d482e93deba1fa0760cb26 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:02 +0000
Subject: kselftest/arm64: Add a test program to exercise the syscall ABI

Currently we don't have any coverage of the syscall ABI so let's add a very
dumb test program which sets up register patterns, does a sysscall and then
checks that the register state after the syscall matches what we expect.
The program is written in an extremely simplistic fashion with the goal of
making it easy to verify that it's doing what it thinks it's doing, it is
not a model of how one should write actual code.

Currently we validate the general purpose, FPSIMD and SVE registers. There
are other thing things that could be covered like FPCR and flags registers,
these can be covered incrementally - my main focus at the minute is
covering the ABI for the SVE registers.

The program repeats the tests for all possible SVE vector lengths in case
some vector length specific optimisation causes issues, as well as testing
FPSIMD only. It tries two syscalls, getpid() and sched_yield(), in an
effort to cover both immediate return to userspace and scheduling another
task though there are no guarantees which cases will be hit.

A new test directory "abi" is added to hold the test, it doesn't seem to
fit well into any of the existing directories.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-7-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/Makefile             |   2 +-
 tools/testing/selftests/arm64/abi/.gitignore       |   1 +
 tools/testing/selftests/arm64/abi/Makefile         |   8 +
 .../testing/selftests/arm64/abi/syscall-abi-asm.S  | 240 ++++++++++++++++
 tools/testing/selftests/arm64/abi/syscall-abi.c    | 318 +++++++++++++++++++++
 5 files changed, 568 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/abi/.gitignore
 create mode 100644 tools/testing/selftests/arm64/abi/Makefile
 create mode 100644 tools/testing/selftests/arm64/abi/syscall-abi-asm.S
 create mode 100644 tools/testing/selftests/arm64/abi/syscall-abi.c

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index ced910fb4019..1e8d9a8f59df 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
 else
 ARM64_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/arm64/abi/.gitignore b/tools/testing/selftests/arm64/abi/.gitignore
new file mode 100644
index 000000000000..b79cf5814c23
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/.gitignore
@@ -0,0 +1 @@
+syscall-abi
diff --git a/tools/testing/selftests/arm64/abi/Makefile b/tools/testing/selftests/arm64/abi/Makefile
new file mode 100644
index 000000000000..96eba974ac8d
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 ARM Limited
+
+TEST_GEN_PROGS := syscall-abi
+
+include ../../lib.mk
+
+$(OUTPUT)/syscall-abi: syscall-abi.c syscall-abi-asm.S
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
new file mode 100644
index 000000000000..983467cfcee0
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+//
+// Assembly portion of the syscall ABI test
+
+//
+// Load values from memory into registers, invoke a syscall and save the
+// register values back to memory for later checking.  The syscall to be
+// invoked is configured in x8 of the input GPR data.
+//
+// x0:	SVE VL, 0 for FP only
+//
+//	GPRs:	gpr_in, gpr_out
+//	FPRs:	fpr_in, fpr_out
+//	Zn:	z_in, z_out
+//	Pn:	p_in, p_out
+//	FFR:	ffr_in, ffr_out
+
+.arch_extension sve
+
+.globl do_syscall
+do_syscall:
+	// Store callee saved registers x19-x29 (80 bytes) plus x0 and x1
+	stp	x29, x30, [sp, #-112]!
+	mov	x29, sp
+	stp	x0, x1, [sp, #16]
+	stp	x19, x20, [sp, #32]
+	stp	x21, x22, [sp, #48]
+	stp	x23, x24, [sp, #64]
+	stp	x25, x26, [sp, #80]
+	stp	x27, x28, [sp, #96]
+
+	// Load GPRs x8-x28, and save our SP/FP for later comparison
+	ldr	x2, =gpr_in
+	add	x2, x2, #64
+	ldp	x8, x9, [x2], #16
+	ldp	x10, x11, [x2], #16
+	ldp	x12, x13, [x2], #16
+	ldp	x14, x15, [x2], #16
+	ldp	x16, x17, [x2], #16
+	ldp	x18, x19, [x2], #16
+	ldp	x20, x21, [x2], #16
+	ldp	x22, x23, [x2], #16
+	ldp	x24, x25, [x2], #16
+	ldp	x26, x27, [x2], #16
+	ldr	x28, [x2], #8
+	str	x29, [x2], #8		// FP
+	str	x30, [x2], #8		// LR
+
+	// Load FPRs if we're not doing SVE
+	cbnz	x0, 1f
+	ldr	x2, =fpr_in
+	ldp	q0, q1, [x2]
+	ldp	q2, q3, [x2, #16 * 2]
+	ldp	q4, q5, [x2, #16 * 4]
+	ldp	q6, q7, [x2, #16 * 6]
+	ldp	q8, q9, [x2, #16 * 8]
+	ldp	q10, q11, [x2, #16 * 10]
+	ldp	q12, q13, [x2, #16 * 12]
+	ldp	q14, q15, [x2, #16 * 14]
+	ldp	q16, q17, [x2, #16 * 16]
+	ldp	q18, q19, [x2, #16 * 18]
+	ldp	q20, q21, [x2, #16 * 20]
+	ldp	q22, q23, [x2, #16 * 22]
+	ldp	q24, q25, [x2, #16 * 24]
+	ldp	q26, q27, [x2, #16 * 26]
+	ldp	q28, q29, [x2, #16 * 28]
+	ldp	q30, q31, [x2, #16 * 30]
+1:
+
+	// Load the SVE registers if we're doing SVE
+	cbz	x0, 1f
+
+	ldr	x2, =z_in
+	ldr	z0, [x2, #0, MUL VL]
+	ldr	z1, [x2, #1, MUL VL]
+	ldr	z2, [x2, #2, MUL VL]
+	ldr	z3, [x2, #3, MUL VL]
+	ldr	z4, [x2, #4, MUL VL]
+	ldr	z5, [x2, #5, MUL VL]
+	ldr	z6, [x2, #6, MUL VL]
+	ldr	z7, [x2, #7, MUL VL]
+	ldr	z8, [x2, #8, MUL VL]
+	ldr	z9, [x2, #9, MUL VL]
+	ldr	z10, [x2, #10, MUL VL]
+	ldr	z11, [x2, #11, MUL VL]
+	ldr	z12, [x2, #12, MUL VL]
+	ldr	z13, [x2, #13, MUL VL]
+	ldr	z14, [x2, #14, MUL VL]
+	ldr	z15, [x2, #15, MUL VL]
+	ldr	z16, [x2, #16, MUL VL]
+	ldr	z17, [x2, #17, MUL VL]
+	ldr	z18, [x2, #18, MUL VL]
+	ldr	z19, [x2, #19, MUL VL]
+	ldr	z20, [x2, #20, MUL VL]
+	ldr	z21, [x2, #21, MUL VL]
+	ldr	z22, [x2, #22, MUL VL]
+	ldr	z23, [x2, #23, MUL VL]
+	ldr	z24, [x2, #24, MUL VL]
+	ldr	z25, [x2, #25, MUL VL]
+	ldr	z26, [x2, #26, MUL VL]
+	ldr	z27, [x2, #27, MUL VL]
+	ldr	z28, [x2, #28, MUL VL]
+	ldr	z29, [x2, #29, MUL VL]
+	ldr	z30, [x2, #30, MUL VL]
+	ldr	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =ffr_in
+	ldr	p0, [x2, #0]
+	wrffr	p0.b
+
+	ldr	x2, =p_in
+	ldr	p0, [x2, #0, MUL VL]
+	ldr	p1, [x2, #1, MUL VL]
+	ldr	p2, [x2, #2, MUL VL]
+	ldr	p3, [x2, #3, MUL VL]
+	ldr	p4, [x2, #4, MUL VL]
+	ldr	p5, [x2, #5, MUL VL]
+	ldr	p6, [x2, #6, MUL VL]
+	ldr	p7, [x2, #7, MUL VL]
+	ldr	p8, [x2, #8, MUL VL]
+	ldr	p9, [x2, #9, MUL VL]
+	ldr	p10, [x2, #10, MUL VL]
+	ldr	p11, [x2, #11, MUL VL]
+	ldr	p12, [x2, #12, MUL VL]
+	ldr	p13, [x2, #13, MUL VL]
+	ldr	p14, [x2, #14, MUL VL]
+	ldr	p15, [x2, #15, MUL VL]
+1:
+
+	// Do the syscall
+	svc	#0
+
+	// Save GPRs x8-x30
+	ldr	x2, =gpr_out
+	add	x2, x2, #64
+	stp	x8, x9, [x2], #16
+	stp	x10, x11, [x2], #16
+	stp	x12, x13, [x2], #16
+	stp	x14, x15, [x2], #16
+	stp	x16, x17, [x2], #16
+	stp	x18, x19, [x2], #16
+	stp	x20, x21, [x2], #16
+	stp	x22, x23, [x2], #16
+	stp	x24, x25, [x2], #16
+	stp	x26, x27, [x2], #16
+	stp	x28, x29, [x2], #16
+	str	x30, [x2]
+
+	// Restore x0 and x1 for feature checks
+	ldp	x0, x1, [sp, #16]
+
+	// Save FPSIMD state
+	ldr	x2, =fpr_out
+	stp	q0, q1, [x2]
+	stp	q2, q3, [x2, #16 * 2]
+	stp	q4, q5, [x2, #16 * 4]
+	stp	q6, q7, [x2, #16 * 6]
+	stp	q8, q9, [x2, #16 * 8]
+	stp	q10, q11, [x2, #16 * 10]
+	stp	q12, q13, [x2, #16 * 12]
+	stp	q14, q15, [x2, #16 * 14]
+	stp	q16, q17, [x2, #16 * 16]
+	stp	q18, q19, [x2, #16 * 18]
+	stp	q20, q21, [x2, #16 * 20]
+	stp	q22, q23, [x2, #16 * 22]
+	stp	q24, q25, [x2, #16 * 24]
+	stp	q26, q27, [x2, #16 * 26]
+	stp	q28, q29, [x2, #16 * 28]
+	stp	q30, q31, [x2, #16 * 30]
+
+	// Save the SVE state if we have some
+	cbz	x0, 1f
+
+	ldr	x2, =z_out
+	str	z0, [x2, #0, MUL VL]
+	str	z1, [x2, #1, MUL VL]
+	str	z2, [x2, #2, MUL VL]
+	str	z3, [x2, #3, MUL VL]
+	str	z4, [x2, #4, MUL VL]
+	str	z5, [x2, #5, MUL VL]
+	str	z6, [x2, #6, MUL VL]
+	str	z7, [x2, #7, MUL VL]
+	str	z8, [x2, #8, MUL VL]
+	str	z9, [x2, #9, MUL VL]
+	str	z10, [x2, #10, MUL VL]
+	str	z11, [x2, #11, MUL VL]
+	str	z12, [x2, #12, MUL VL]
+	str	z13, [x2, #13, MUL VL]
+	str	z14, [x2, #14, MUL VL]
+	str	z15, [x2, #15, MUL VL]
+	str	z16, [x2, #16, MUL VL]
+	str	z17, [x2, #17, MUL VL]
+	str	z18, [x2, #18, MUL VL]
+	str	z19, [x2, #19, MUL VL]
+	str	z20, [x2, #20, MUL VL]
+	str	z21, [x2, #21, MUL VL]
+	str	z22, [x2, #22, MUL VL]
+	str	z23, [x2, #23, MUL VL]
+	str	z24, [x2, #24, MUL VL]
+	str	z25, [x2, #25, MUL VL]
+	str	z26, [x2, #26, MUL VL]
+	str	z27, [x2, #27, MUL VL]
+	str	z28, [x2, #28, MUL VL]
+	str	z29, [x2, #29, MUL VL]
+	str	z30, [x2, #30, MUL VL]
+	str	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =p_out
+	str	p0, [x2, #0, MUL VL]
+	str	p1, [x2, #1, MUL VL]
+	str	p2, [x2, #2, MUL VL]
+	str	p3, [x2, #3, MUL VL]
+	str	p4, [x2, #4, MUL VL]
+	str	p5, [x2, #5, MUL VL]
+	str	p6, [x2, #6, MUL VL]
+	str	p7, [x2, #7, MUL VL]
+	str	p8, [x2, #8, MUL VL]
+	str	p9, [x2, #9, MUL VL]
+	str	p10, [x2, #10, MUL VL]
+	str	p11, [x2, #11, MUL VL]
+	str	p12, [x2, #12, MUL VL]
+	str	p13, [x2, #13, MUL VL]
+	str	p14, [x2, #14, MUL VL]
+	str	p15, [x2, #15, MUL VL]
+
+	ldr	x2, =ffr_out
+	rdffr	p0.b
+	str	p0, [x2, #0]
+1:
+
+	// Restore callee saved registers x19-x30
+	ldp	x19, x20, [sp, #32]
+	ldp	x21, x22, [sp, #48]
+	ldp	x23, x24, [sp, #64]
+	ldp	x25, x26, [sp, #80]
+	ldp	x27, x28, [sp, #96]
+	ldp	x29, x30, [sp], #112
+
+	ret
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
new file mode 100644
index 000000000000..d8eeeafb50dc
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 ARM Limited.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <asm/hwcap.h>
+#include <asm/sigcontext.h>
+#include <asm/unistd.h>
+
+#include "../../kselftest.h"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#define NUM_VL ((SVE_VQ_MAX - SVE_VQ_MIN) + 1)
+
+extern void do_syscall(int sve_vl);
+
+static void fill_random(void *buf, size_t size)
+{
+	int i;
+	uint32_t *lbuf = buf;
+
+	/* random() returns a 32 bit number regardless of the size of long */
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		lbuf[i] = random();
+}
+
+/*
+ * We also repeat the test for several syscalls to try to expose different
+ * behaviour.
+ */
+static struct syscall_cfg {
+	int syscall_nr;
+	const char *name;
+} syscalls[] = {
+	{ __NR_getpid,		"getpid()" },
+	{ __NR_sched_yield,	"sched_yield()" },
+};
+
+#define NUM_GPR 31
+uint64_t gpr_in[NUM_GPR];
+uint64_t gpr_out[NUM_GPR];
+
+static void setup_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(gpr_in, sizeof(gpr_in));
+	gpr_in[8] = cfg->syscall_nr;
+	memset(gpr_out, 0, sizeof(gpr_out));
+}
+
+static int check_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	/*
+	 * GPR x0-x7 may be clobbered, and all others should be preserved.
+	 */
+	for (i = 9; i < ARRAY_SIZE(gpr_in); i++) {
+		if (gpr_in[i] != gpr_out[i]) {
+			ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n",
+				       cfg->name, sve_vl, i,
+				       gpr_in[i], gpr_out[i]);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+#define NUM_FPR 32
+uint64_t fpr_in[NUM_FPR * 2];
+uint64_t fpr_out[NUM_FPR * 2];
+
+static void setup_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(fpr_in, sizeof(fpr_in));
+	memset(fpr_out, 0, sizeof(fpr_out));
+}
+
+static int check_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	if (!sve_vl) {
+		for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
+			if (fpr_in[i] != fpr_out[i]) {
+				ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
+					       cfg->name,
+					       i / 2, i % 2,
+					       fpr_in[i], fpr_out[i]);
+				errors++;
+			}
+		}
+	}
+
+	return errors;
+}
+
+static uint8_t z_zero[__SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_in[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_out[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(z_in, sizeof(z_in));
+	fill_random(z_out, sizeof(z_out));
+}
+
+static int check_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vl;
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/*
+	 * After a syscall the low 128 bits of the Z registers should
+	 * be preserved and the rest be zeroed or preserved.
+	 */
+	for (i = 0; i < SVE_NUM_ZREGS; i++) {
+		void *in = &z_in[reg_size * i];
+		void *out = &z_out[reg_size * i];
+
+		if (memcmp(in, out, SVE_VQ_BYTES) != 0) {
+			ksft_print_msg("%s SVE VL %d Z%d low 128 bits changed\n",
+				       cfg->name, sve_vl, i);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(p_in, sizeof(p_in));
+	fill_random(p_out, sizeof(p_out));
+}
+
+static int check_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2; /* 1 bit per VL byte */
+
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < SVE_NUM_PREGS * reg_size; i++)
+		if (p_out[i] && (p_in[i] != p_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d predicate registers non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+uint8_t ffr_in[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t ffr_out[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	/*
+	 * It is only valid to set a contiguous set of bits starting
+	 * at 0.  For now since we're expecting this to be cleared by
+	 * a syscall just set all bits.
+	 */
+	memset(ffr_in, 0xff, sizeof(ffr_in));
+	fill_random(ffr_out, sizeof(ffr_out));
+}
+
+static int check_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2;  /* 1 bit per VL byte */
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < reg_size; i++)
+		if (ffr_out[i] && (ffr_in[i] != ffr_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d FFR non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+typedef void (*setup_fn)(struct syscall_cfg *cfg, int sve_vl);
+typedef int (*check_fn)(struct syscall_cfg *cfg, int sve_vl);
+
+/*
+ * Each set of registers has a setup function which is called before
+ * the syscall to fill values in a global variable for loading by the
+ * test code and a check function which validates that the results are
+ * as expected.  Vector lengths are passed everywhere, a vector length
+ * of 0 should be treated as do not test.
+ */
+static struct {
+	setup_fn setup;
+	check_fn check;
+} regset[] = {
+	{ setup_gpr, check_gpr },
+	{ setup_fpr, check_fpr },
+	{ setup_z, check_z },
+	{ setup_p, check_p },
+	{ setup_ffr, check_ffr },
+};
+
+static bool do_test(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		regset[i].setup(cfg, sve_vl);
+
+	do_syscall(sve_vl);
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		errors += regset[i].check(cfg, sve_vl);
+
+	return errors == 0;
+}
+
+static void test_one_syscall(struct syscall_cfg *cfg)
+{
+	int sve_vq, sve_vl;
+
+	/* FPSIMD only case */
+	ksft_test_result(do_test(cfg, 0),
+			 "%s FPSIMD\n", cfg->name);
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return;
+
+	for (sve_vq = SVE_VQ_MAX; sve_vq > 0; --sve_vq) {
+		sve_vl = prctl(PR_SVE_SET_VL, sve_vq * 16);
+		if (sve_vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		sve_vl &= PR_SVE_VL_LEN_MASK;
+
+		if (sve_vq != sve_vq_from_vl(sve_vl))
+			sve_vq = sve_vq_from_vl(sve_vl);
+
+		ksft_test_result(do_test(cfg, sve_vl),
+				 "%s SVE VL %d\n", cfg->name, sve_vl);
+	}
+}
+
+int sve_count_vls(void)
+{
+	unsigned int vq;
+	int vl_count = 0;
+	int vl;
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return 0;
+
+	/*
+	 * Enumerate up to SVE_VQ_MAX vector lengths
+	 */
+	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+		vl = prctl(PR_SVE_SET_VL, vq * 16);
+		if (vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		vl &= PR_SVE_VL_LEN_MASK;
+
+		if (vq != sve_vq_from_vl(vl))
+			vq = sve_vq_from_vl(vl);
+
+		vl_count++;
+	}
+
+	return vl_count;
+}
+
+int main(void)
+{
+	int i;
+
+	srandom(getpid());
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(syscalls) * (sve_count_vls() + 1));
+
+	for (i = 0; i < ARRAY_SIZE(syscalls); i++)
+		test_one_syscall(&syscalls[i]);
+
+	ksft_print_cnts();
+
+	return 0;
+}
-- 
cgit v1.2.3


From 2c94ebedc844d64459bfe73b25ae725c3f040526 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Dec 2021 16:51:07 +0000
Subject: kselftest/arm64: Add pidbench for floating point syscall cases

Since it's likely to be useful for performance work with SVE let's have a
pidbench that gives us some numbers for consideration. In order to ensure
that we test exactly the scenario we want this is written in assembly - if
system libraries use SVE this would stop us exercising the case where the
process has never used SVE.

We exercise three cases:

 - Never having used SVE.
 - Having used SVE once.
 - Using SVE after each syscall.

by spinning running getpid() for a fixed number of iterations with the
time measured using CNTVCT_EL0 reported on the console. This is obviously
a totally unrealistic benchmark which will show the extremes of any
performance variation but equally given the potential gotchas with use of
FP instructions by system libraries it's good to have some concrete code
shared to make it easier to compare notes on results.

Testing over multiple SVE vector lengths will need to be done with vlset
currently, the test could be extended to iterate over all of them if
desired.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211202165107.1075259-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/.gitignore    |  1 +
 tools/testing/selftests/arm64/fp/Makefile      |  4 +-
 tools/testing/selftests/arm64/fp/fp-pidbench.S | 71 ++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/fp/fp-pidbench.S

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index b67395903b9b..c50d86331ed2 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -1,3 +1,4 @@
+fp-pidbench
 fpsimd-test
 rdvl-sve
 sve-probe-vls
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index ba1488c7c315..95f0b877a060 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -2,13 +2,15 @@
 
 CFLAGS += -I../../../../../usr/include/
 TEST_GEN_PROGS := sve-ptrace sve-probe-vls vec-syscfg
-TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress \
+TEST_PROGS_EXTENDED := fp-pidbench fpsimd-test fpsimd-stress \
 	rdvl-sve \
 	sve-test sve-stress \
 	vlset
 
 all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED)
 
+fp-pidbench: fp-pidbench.S asm-utils.o
+	$(CC) -nostdlib $^ -o $@
 fpsimd-test: fpsimd-test.o asm-utils.o
 	$(CC) -nostdlib $^ -o $@
 rdvl-sve: rdvl-sve.o rdvl.o
diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
new file mode 100644
index 000000000000..16a436389bfc
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+// Original author: Mark Brown <broonie@kernel.org>
+//
+// Trivial syscall overhead benchmark.
+//
+// This is implemented in asm to ensure that we don't have any issues with
+// system libraries using instructions that disrupt the test.
+
+#include <asm/unistd.h>
+#include "assembler.h"
+
+.arch_extension sve
+
+.macro test_loop per_loop
+	mov	x10, x20
+	mov	x8, #__NR_getpid
+	mrs	x11, CNTVCT_EL0
+1:
+	\per_loop
+	svc	#0
+	sub	x10, x10, #1
+	cbnz	x10, 1b
+
+	mrs	x12, CNTVCT_EL0
+	sub	x0, x12, x11
+	bl	putdec
+	puts	"\n"
+.endm
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+	puts	"Iterations per test: "
+	mov	x20, #10000
+	lsl	x20, x20, #8
+	mov	x0, x20
+	bl	putdec
+	puts	"\n"
+
+	// Test having never used SVE
+	puts	"No SVE: "
+	test_loop
+
+	// Check for SVE support - should use hwcap but that's hard in asm
+	mrs	x0, ID_AA64PFR0_EL1
+	ubfx	x0, x0, #32, #4
+	cbnz	x0, 1f
+	puts	"System does not support SVE\n"
+	b	out
+1:
+
+	// Execute a SVE instruction
+	puts	"SVE VL: "
+	rdvl	x0, #8
+	bl	putdec
+	puts	"\n"
+
+	puts	"SVE used once: "
+	test_loop
+
+	// Use SVE per syscall
+	puts	"SVE used per syscall: "
+	test_loop "rdvl x0, #8"
+
+	//  And we're done
+out:
+	mov	x0, #0
+	mov	x8, #__NR_exit
+	svc	#0
-- 
cgit v1.2.3


From e542f2c4cd16d49392abf3349341d58153d3c603 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 14 Dec 2021 11:59:03 -0800
Subject: libbpf: Auto-bump RLIMIT_MEMLOCK if kernel needs it for BPF

The need to increase RLIMIT_MEMLOCK to do anything useful with BPF is
one of the first extremely frustrating gotchas that all new BPF users go
through and in some cases have to learn it a very hard way.

Luckily, starting with upstream Linux kernel version 5.11, BPF subsystem
dropped the dependency on memlock and uses memcg-based memory accounting
instead. Unfortunately, detecting memcg-based BPF memory accounting is
far from trivial (as can be evidenced by this patch), so in practice
most BPF applications still do unconditional RLIMIT_MEMLOCK increase.

As we move towards libbpf 1.0, it would be good to allow users to forget
about RLIMIT_MEMLOCK vs memcg and let libbpf do the sensible adjustment
automatically. This patch paves the way forward in this matter. Libbpf
will do feature detection of memcg-based accounting, and if detected,
will do nothing. But if the kernel is too old, just like BCC, libbpf
will automatically increase RLIMIT_MEMLOCK on behalf of user
application ([0]).

As this is technically a breaking change, during the transition period
applications have to opt into libbpf 1.0 mode by setting
LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK bit when calling
libbpf_set_strict_mode().

Libbpf allows to control the exact amount of set RLIMIT_MEMLOCK limit
with libbpf_set_memlock_rlim_max() API. Passing 0 will make libbpf do
nothing with RLIMIT_MEMLOCK. libbpf_set_memlock_rlim_max() has to be
called before the first bpf_prog_load(), bpf_btf_load(), or
bpf_object__load() call, otherwise it has no effect and will return
-EBUSY.

  [0] Closes: https://github.com/libbpf/libbpf/issues/369

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211214195904.1785155-2-andrii@kernel.org
---
 tools/lib/bpf/bpf.c             | 81 +++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/bpf.h             |  2 +
 tools/lib/bpf/libbpf.c          | 47 +++++-------------------
 tools/lib/bpf/libbpf.map        |  1 +
 tools/lib/bpf/libbpf_internal.h | 39 ++++++++++++++++++++
 tools/lib/bpf/libbpf_legacy.h   | 12 +++++-
 6 files changed, 143 insertions(+), 39 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 1f84d706eb3e..9b64eed2b003 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -28,7 +28,9 @@
 #include <asm/unistd.h>
 #include <errno.h>
 #include <linux/bpf.h>
+#include <linux/filter.h>
 #include <limits.h>
+#include <sys/resource.h>
 #include "bpf.h"
 #include "libbpf.h"
 #include "libbpf_internal.h"
@@ -94,6 +96,77 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int
 	return fd;
 }
 
+/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to
+ * memcg-based memory accounting for BPF maps and progs. This was done in [0].
+ * We use the support for bpf_ktime_get_coarse_ns() helper, which was added in
+ * the same 5.11 Linux release ([1]), to detect memcg-based accounting for BPF.
+ *
+ *   [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/
+ *   [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper")
+ */
+int probe_memcg_account(void)
+{
+	const size_t prog_load_attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd);
+	struct bpf_insn insns[] = {
+		BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns),
+		BPF_EXIT_INSN(),
+	};
+	size_t insn_cnt = sizeof(insns) / sizeof(insns[0]);
+	union bpf_attr attr;
+	int prog_fd;
+
+	/* attempt loading freplace trying to use custom BTF */
+	memset(&attr, 0, prog_load_attr_sz);
+	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	attr.insns = ptr_to_u64(insns);
+	attr.insn_cnt = insn_cnt;
+	attr.license = ptr_to_u64("GPL");
+
+	prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, prog_load_attr_sz);
+	if (prog_fd >= 0) {
+		close(prog_fd);
+		return 1;
+	}
+	return 0;
+}
+
+static bool memlock_bumped;
+static rlim_t memlock_rlim = RLIM_INFINITY;
+
+int libbpf_set_memlock_rlim(size_t memlock_bytes)
+{
+	if (memlock_bumped)
+		return libbpf_err(-EBUSY);
+
+	memlock_rlim = memlock_bytes;
+	return 0;
+}
+
+int bump_rlimit_memlock(void)
+{
+	struct rlimit rlim;
+
+	/* this the default in libbpf 1.0, but for now user has to opt-in explicitly */
+	if (!(libbpf_mode & LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK))
+		return 0;
+
+	/* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */
+	if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT))
+		return 0;
+
+	memlock_bumped = true;
+
+	/* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */
+	if (memlock_rlim == 0)
+		return 0;
+
+	rlim.rlim_cur = rlim.rlim_max = memlock_rlim;
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim))
+		return -errno;
+
+	return 0;
+}
+
 int bpf_map_create(enum bpf_map_type map_type,
 		   const char *map_name,
 		   __u32 key_size,
@@ -105,6 +178,8 @@ int bpf_map_create(enum bpf_map_type map_type,
 	union bpf_attr attr;
 	int fd;
 
+	bump_rlimit_memlock();
+
 	memset(&attr, 0, attr_sz);
 
 	if (!OPTS_VALID(opts, bpf_map_create_opts))
@@ -251,6 +326,8 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type,
 	union bpf_attr attr;
 	char *log_buf;
 
+	bump_rlimit_memlock();
+
 	if (!OPTS_VALID(opts, bpf_prog_load_opts))
 		return libbpf_err(-EINVAL);
 
@@ -456,6 +533,8 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	union bpf_attr attr;
 	int fd;
 
+	bump_rlimit_memlock();
+
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = type;
 	attr.insn_cnt = (__u32)insns_cnt;
@@ -1056,6 +1135,8 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa
 	__u32 log_level;
 	int fd;
 
+	bump_rlimit_memlock();
+
 	memset(&attr, 0, attr_sz);
 
 	if (!OPTS_VALID(opts, bpf_btf_load_opts))
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 94e553a0ff9d..00619f64a040 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -35,6 +35,8 @@
 extern "C" {
 #endif
 
+int libbpf_set_memlock_rlim(size_t memlock_bytes);
+
 struct bpf_map_create_opts {
 	size_t sz; /* size of this struct for forward/backward compatibility */
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c6d84b01a59e..684c34ce1a26 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -187,42 +187,6 @@ const char *libbpf_version_string(void)
 #undef __S
 }
 
-enum kern_feature_id {
-	/* v4.14: kernel support for program & map names. */
-	FEAT_PROG_NAME,
-	/* v5.2: kernel support for global data sections. */
-	FEAT_GLOBAL_DATA,
-	/* BTF support */
-	FEAT_BTF,
-	/* BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO support */
-	FEAT_BTF_FUNC,
-	/* BTF_KIND_VAR and BTF_KIND_DATASEC support */
-	FEAT_BTF_DATASEC,
-	/* BTF_FUNC_GLOBAL is supported */
-	FEAT_BTF_GLOBAL_FUNC,
-	/* BPF_F_MMAPABLE is supported for arrays */
-	FEAT_ARRAY_MMAP,
-	/* kernel support for expected_attach_type in BPF_PROG_LOAD */
-	FEAT_EXP_ATTACH_TYPE,
-	/* bpf_probe_read_{kernel,user}[_str] helpers */
-	FEAT_PROBE_READ_KERN,
-	/* BPF_PROG_BIND_MAP is supported */
-	FEAT_PROG_BIND_MAP,
-	/* Kernel support for module BTFs */
-	FEAT_MODULE_BTF,
-	/* BTF_KIND_FLOAT support */
-	FEAT_BTF_FLOAT,
-	/* BPF perf link support */
-	FEAT_PERF_LINK,
-	/* BTF_KIND_DECL_TAG support */
-	FEAT_BTF_DECL_TAG,
-	/* BTF_KIND_TYPE_TAG support */
-	FEAT_BTF_TYPE_TAG,
-	__FEAT_CNT,
-};
-
-static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id);
-
 enum reloc_type {
 	RELO_LD64,
 	RELO_CALL,
@@ -4352,6 +4316,10 @@ bpf_object__probe_loading(struct bpf_object *obj)
 	if (obj->gen_loader)
 		return 0;
 
+	ret = bump_rlimit_memlock();
+	if (ret)
+		pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret);
+
 	/* make sure basic loading works */
 	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0)
@@ -4718,14 +4686,17 @@ static struct kern_feature_desc {
 	[FEAT_BTF_TYPE_TAG] = {
 		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
 	},
+	[FEAT_MEMCG_ACCOUNT] = {
+		"memcg-based memory accounting", probe_memcg_account,
+	},
 };
 
-static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
+bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
 {
 	struct kern_feature_desc *feat = &feature_probes[feat_id];
 	int ret;
 
-	if (obj->gen_loader)
+	if (obj && obj->gen_loader)
 		/* To generate loader program assume the latest kernel
 		 * to avoid doing extra prog_load, map_create syscalls.
 		 */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 4d483af7dba6..b3938b3f8fc9 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -427,4 +427,5 @@ LIBBPF_0.7.0 {
 		bpf_program__log_level;
 		bpf_program__set_log_buf;
 		bpf_program__set_log_level;
+		libbpf_set_memlock_rlim_max;
 };
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 5e8166a2f3d8..5dbe4f463880 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -291,6 +291,45 @@ static inline bool libbpf_validate_opts(const char *opts,
 					(opts)->sz - __off);		      \
 })
 
+enum kern_feature_id {
+	/* v4.14: kernel support for program & map names. */
+	FEAT_PROG_NAME,
+	/* v5.2: kernel support for global data sections. */
+	FEAT_GLOBAL_DATA,
+	/* BTF support */
+	FEAT_BTF,
+	/* BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO support */
+	FEAT_BTF_FUNC,
+	/* BTF_KIND_VAR and BTF_KIND_DATASEC support */
+	FEAT_BTF_DATASEC,
+	/* BTF_FUNC_GLOBAL is supported */
+	FEAT_BTF_GLOBAL_FUNC,
+	/* BPF_F_MMAPABLE is supported for arrays */
+	FEAT_ARRAY_MMAP,
+	/* kernel support for expected_attach_type in BPF_PROG_LOAD */
+	FEAT_EXP_ATTACH_TYPE,
+	/* bpf_probe_read_{kernel,user}[_str] helpers */
+	FEAT_PROBE_READ_KERN,
+	/* BPF_PROG_BIND_MAP is supported */
+	FEAT_PROG_BIND_MAP,
+	/* Kernel support for module BTFs */
+	FEAT_MODULE_BTF,
+	/* BTF_KIND_FLOAT support */
+	FEAT_BTF_FLOAT,
+	/* BPF perf link support */
+	FEAT_PERF_LINK,
+	/* BTF_KIND_DECL_TAG support */
+	FEAT_BTF_DECL_TAG,
+	/* BTF_KIND_TYPE_TAG support */
+	FEAT_BTF_TYPE_TAG,
+	/* memcg-based accounting for BPF maps and progs */
+	FEAT_MEMCG_ACCOUNT,
+	__FEAT_CNT,
+};
+
+int probe_memcg_account(void);
+bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id);
+int bump_rlimit_memlock(void);
 
 int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz);
 int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h
index bb03c568af7b..79131f761a27 100644
--- a/tools/lib/bpf/libbpf_legacy.h
+++ b/tools/lib/bpf/libbpf_legacy.h
@@ -45,7 +45,6 @@ enum libbpf_strict_mode {
 	 * (positive) error code.
 	 */
 	LIBBPF_STRICT_DIRECT_ERRS = 0x02,
-
 	/*
 	 * Enforce strict BPF program section (SEC()) names.
 	 * E.g., while prefiously SEC("xdp_whatever") or SEC("perf_event_blah") were
@@ -63,6 +62,17 @@ enum libbpf_strict_mode {
 	 * Clients can maintain it on their own if it is valuable for them.
 	 */
 	LIBBPF_STRICT_NO_OBJECT_LIST = 0x08,
+	/*
+	 * Automatically bump RLIMIT_MEMLOCK using setrlimit() before the
+	 * first BPF program or map creation operation. This is done only if
+	 * kernel is too old to support memcg-based memory accounting for BPF
+	 * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY,
+	 * but it can be overriden with libbpf_set_memlock_rlim_max() API.
+	 * Note that libbpf_set_memlock_rlim_max() needs to be called before
+	 * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load()
+	 * operation.
+	 */
+	LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK = 0x10,
 
 	__LIBBPF_STRICT_LAST,
 };
-- 
cgit v1.2.3


From c164b8b40422ef5c643d08bbc63280e1e1610573 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 14 Dec 2021 11:59:04 -0800
Subject: selftests/bpf: Remove explicit setrlimit(RLIMIT_MEMLOCK) in main
 selftests

As libbpf now is able to automatically take care of RLIMIT_MEMLOCK
increase (or skip it altogether on recent enough kernels), remove
explicit setrlimit() invocations in bench, test_maps, test_verifier, and
test_progs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211214195904.1785155-3-andrii@kernel.org
---
 tools/testing/selftests/bpf/bench.c                      | 16 ----------------
 tools/testing/selftests/bpf/prog_tests/btf.c             |  1 -
 .../testing/selftests/bpf/prog_tests/select_reuseport.c  |  1 -
 tools/testing/selftests/bpf/prog_tests/sk_lookup.c       |  1 -
 tools/testing/selftests/bpf/prog_tests/sock_fields.c     |  1 -
 tools/testing/selftests/bpf/test_maps.c                  |  1 -
 tools/testing/selftests/bpf/test_progs.c                 |  2 --
 tools/testing/selftests/bpf/test_verifier.c              |  4 +++-
 8 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index bbb42e2cee0c..f973320e6dbf 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -29,26 +29,10 @@ static int libbpf_print_fn(enum libbpf_print_level level,
 	return vfprintf(stderr, format, args);
 }
 
-static int bump_memlock_rlimit(void)
-{
-	struct rlimit rlim_new = {
-		.rlim_cur	= RLIM_INFINITY,
-		.rlim_max	= RLIM_INFINITY,
-	};
-
-	return setrlimit(RLIMIT_MEMLOCK, &rlim_new);
-}
-
 void setup_libbpf(void)
 {
-	int err;
-
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
 	libbpf_set_print(libbpf_print_fn);
-
-	err = bump_memlock_rlimit();
-	if (err)
-		fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d", err);
 }
 
 void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 01b776a7beeb..8ba53acf9eb4 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -22,7 +22,6 @@
 #include <bpf/libbpf.h>
 #include <bpf/btf.h>
 
-#include "bpf_rlimit.h"
 #include "bpf_util.h"
 #include "../test_btf.h"
 #include "test_progs.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
index 980ac0f2c0bb..1cbd8cd64044 100644
--- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -18,7 +18,6 @@
 #include <netinet/in.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
-#include "bpf_rlimit.h"
 #include "bpf_util.h"
 
 #include "test_progs.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
index 57846cc7ce36..597d0467a926 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
@@ -30,7 +30,6 @@
 #include <bpf/bpf.h>
 
 #include "test_progs.h"
-#include "bpf_rlimit.h"
 #include "bpf_util.h"
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
index fae40db4d81f..9fc040eaa482 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
@@ -15,7 +15,6 @@
 #include "network_helpers.h"
 #include "cgroup_helpers.h"
 #include "test_progs.h"
-#include "bpf_rlimit.h"
 #include "test_sock_fields.skel.h"
 
 enum bpf_linum_array_idx {
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index f4cd658bbe00..50f7e74ca0b9 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -23,7 +23,6 @@
 #include <bpf/libbpf.h>
 
 #include "bpf_util.h"
-#include "bpf_rlimit.h"
 #include "test_maps.h"
 #include "testing_helpers.h"
 
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 296928948bb9..2ecb73a65206 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -4,7 +4,6 @@
 #define _GNU_SOURCE
 #include "test_progs.h"
 #include "cgroup_helpers.h"
-#include "bpf_rlimit.h"
 #include <argp.h>
 #include <pthread.h>
 #include <sched.h>
@@ -1342,7 +1341,6 @@ int main(int argc, char **argv)
 
 	/* Use libbpf 1.0 API mode */
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
 	libbpf_set_print(libbpf_print_fn);
 
 	srand(time(NULL));
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index ad5d30bafd93..b0bd2a1f6d52 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -41,7 +41,6 @@
 #  define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
 # endif
 #endif
-#include "bpf_rlimit.h"
 #include "bpf_rand.h"
 #include "bpf_util.h"
 #include "test_btf.h"
@@ -1395,6 +1394,9 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
+	/* Use libbpf 1.0 API mode */
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
 	bpf_semi_rand_init();
 	return do_test(unpriv, from, to);
 }
-- 
cgit v1.2.3


From a393ea80a22a9beffdc9a527bd2f9f270e7a0c6e Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 13 Dec 2021 19:59:28 -0800
Subject: selftests/bpf: Stop using bpf_object__find_program_by_title API.

bpf_object__find_program_by_title is going to be deprecated.  Replace
all use cases in tools/testing/selftests/bpf with
bpf_object__find_program_by_name or bpf_object__for_each_program.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211214035931.1148209-2-kuifeng@fb.com
---
 .../testing/selftests/bpf/prog_tests/bpf_obj_id.c  |  4 +-
 .../selftests/bpf/prog_tests/connect_force_port.c  | 18 ++---
 .../testing/selftests/bpf/prog_tests/core_reloc.c  | 79 +++++++++++++++-------
 .../selftests/bpf/prog_tests/fexit_bpf2bpf.c       | 17 +++--
 .../selftests/bpf/prog_tests/get_stack_raw_tp.c    |  4 +-
 .../selftests/bpf/prog_tests/sockopt_inherit.c     | 15 ++--
 .../selftests/bpf/prog_tests/stacktrace_map.c      |  4 +-
 .../bpf/prog_tests/stacktrace_map_raw_tp.c         |  4 +-
 .../selftests/bpf/prog_tests/test_overhead.c       | 20 +++---
 .../selftests/bpf/prog_tests/trampoline_count.c    |  6 +-
 10 files changed, 104 insertions(+), 67 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index 0a6c5f00abd4..dbe56fa8582d 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -65,8 +65,8 @@ void serial_test_bpf_obj_id(void)
 		if (CHECK_FAIL(err))
 			goto done;
 
-		prog = bpf_object__find_program_by_title(objs[i],
-							 "raw_tp/sys_enter");
+		prog = bpf_object__find_program_by_name(objs[i],
+							"test_obj_id");
 		if (CHECK_FAIL(!prog))
 			goto done;
 		links[i] = bpf_program__attach(prog);
diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
index ca574e1e30e6..9c4325f4aef2 100644
--- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
+++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
@@ -67,9 +67,9 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
 		goto close_bpf_object;
 	}
 
-	prog = bpf_object__find_program_by_title(obj, v4 ?
-						 "cgroup/connect4" :
-						 "cgroup/connect6");
+	prog = bpf_object__find_program_by_name(obj, v4 ?
+						"connect4" :
+						"connect6");
 	if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
 		err = -EIO;
 		goto close_bpf_object;
@@ -83,9 +83,9 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
 		goto close_bpf_object;
 	}
 
-	prog = bpf_object__find_program_by_title(obj, v4 ?
-						 "cgroup/getpeername4" :
-						 "cgroup/getpeername6");
+	prog = bpf_object__find_program_by_name(obj, v4 ?
+						"getpeername4" :
+						"getpeername6");
 	if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
 		err = -EIO;
 		goto close_bpf_object;
@@ -99,9 +99,9 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
 		goto close_bpf_object;
 	}
 
-	prog = bpf_object__find_program_by_title(obj, v4 ?
-						 "cgroup/getsockname4" :
-						 "cgroup/getsockname6");
+	prog = bpf_object__find_program_by_name(obj, v4 ?
+						"getsockname4" :
+						"getsockname6");
 	if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
 		err = -EIO;
 		goto close_bpf_object;
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 44a9868c70ea..b8bdd1c3efca 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -10,7 +10,7 @@ static int duration = 0;
 
 #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name)
 
-#define MODULES_CASE(name, sec_name, tp_name) {				\
+#define MODULES_CASE(name, pg_name, tp_name) {				\
 	.case_name = name,						\
 	.bpf_obj_file = "test_core_reloc_module.o",			\
 	.btf_src_file = NULL, /* find in kernel module BTFs */		\
@@ -28,7 +28,7 @@ static int duration = 0;
 		.comm_len = sizeof("test_progs"),			\
 	},								\
 	.output_len = sizeof(struct core_reloc_module_output),		\
-	.prog_sec_name = sec_name,					\
+	.prog_name = pg_name,						\
 	.raw_tp_name = tp_name,						\
 	.trigger = __trigger_module_test_read,				\
 	.needs_testmod = true,						\
@@ -43,7 +43,9 @@ static int duration = 0;
 #define FLAVORS_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_flavors.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"			\
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_flavors"				\
 
 #define FLAVORS_CASE(name) {						\
 	FLAVORS_CASE_COMMON(name),					\
@@ -66,7 +68,9 @@ static int duration = 0;
 #define NESTING_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_nesting.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_nesting"				\
 
 #define NESTING_CASE(name) {						\
 	NESTING_CASE_COMMON(name),					\
@@ -91,7 +95,9 @@ static int duration = 0;
 #define ARRAYS_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_arrays.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_arrays"					\
 
 #define ARRAYS_CASE(name) {						\
 	ARRAYS_CASE_COMMON(name),					\
@@ -123,7 +129,9 @@ static int duration = 0;
 #define PRIMITIVES_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_primitives.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_primitives"				\
 
 #define PRIMITIVES_CASE(name) {						\
 	PRIMITIVES_CASE_COMMON(name),					\
@@ -158,6 +166,8 @@ static int duration = 0;
 		.e = 5, .f = 6, .g = 7, .h = 8,				\
 	},								\
 	.output_len = sizeof(struct core_reloc_mods_output),		\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_mods",					\
 }
 
 #define PTR_AS_ARR_CASE(name) {						\
@@ -174,6 +184,8 @@ static int duration = 0;
 		.a = 3,							\
 	},								\
 	.output_len = sizeof(struct core_reloc_ptr_as_arr),		\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_ptr_as_arr",				\
 }
 
 #define INTS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) {	\
@@ -190,7 +202,9 @@ static int duration = 0;
 #define INTS_CASE_COMMON(name)						\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_ints.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_ints"
 
 #define INTS_CASE(name) {						\
 	INTS_CASE_COMMON(name),						\
@@ -208,7 +222,9 @@ static int duration = 0;
 #define FIELD_EXISTS_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_existence.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"			\
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_existence"
 
 #define BITFIELDS_CASE_COMMON(objfile, test_name_prefix,  name)		\
 	.case_name = test_name_prefix#name,				\
@@ -223,6 +239,8 @@ static int duration = 0;
 	.output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output)	\
 		__VA_ARGS__,						\
 	.output_len = sizeof(struct core_reloc_bitfields_output),	\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_bitfields",				\
 }, {									\
 	BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o",	\
 			      "direct:", name),				\
@@ -231,7 +249,7 @@ static int duration = 0;
 	.output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output)	\
 		__VA_ARGS__,						\
 	.output_len = sizeof(struct core_reloc_bitfields_output),	\
-	.prog_sec_name = "tp_btf/sys_enter",				\
+	.prog_name = "test_core_bitfields_direct",			\
 }
 
 
@@ -239,17 +257,21 @@ static int duration = 0;
 	BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o",	\
 			      "probed:", name),				\
 	.fails = true,							\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_bitfields",				\
 }, {									\
 	BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o",	\
 			      "direct:", name),				\
-	.prog_sec_name = "tp_btf/sys_enter",				\
 	.fails = true,							\
+	.prog_name = "test_core_bitfields_direct",			\
 }
 
 #define SIZE_CASE_COMMON(name)						\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_size.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_size"
 
 #define SIZE_OUTPUT_DATA(type)						\
 	STRUCT_TO_CHAR_PTR(core_reloc_size_output) {			\
@@ -277,8 +299,10 @@ static int duration = 0;
 
 #define TYPE_BASED_CASE_COMMON(name)					\
 	.case_name = #name,						\
-	.bpf_obj_file = "test_core_reloc_type_based.o",		\
-	.btf_src_file = "btf__core_reloc_" #name ".o"			\
+	.bpf_obj_file = "test_core_reloc_type_based.o",			\
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_type_based"
 
 #define TYPE_BASED_CASE(name, ...) {					\
 	TYPE_BASED_CASE_COMMON(name),					\
@@ -295,7 +319,9 @@ static int duration = 0;
 #define TYPE_ID_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_type_id.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"			\
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_type_id"
 
 #define TYPE_ID_CASE(name, setup_fn) {					\
 	TYPE_ID_CASE_COMMON(name),					\
@@ -312,7 +338,9 @@ static int duration = 0;
 #define ENUMVAL_CASE_COMMON(name)					\
 	.case_name = #name,						\
 	.bpf_obj_file = "test_core_reloc_enumval.o",			\
-	.btf_src_file = "btf__core_reloc_" #name ".o"			\
+	.btf_src_file = "btf__core_reloc_" #name ".o",			\
+	.raw_tp_name = "sys_enter",					\
+	.prog_name = "test_core_enumval"
 
 #define ENUMVAL_CASE(name, ...) {					\
 	ENUMVAL_CASE_COMMON(name),					\
@@ -342,7 +370,7 @@ struct core_reloc_test_case {
 	bool fails;
 	bool needs_testmod;
 	bool relaxed_core_relocs;
-	const char *prog_sec_name;
+	const char *prog_name;
 	const char *raw_tp_name;
 	setup_test_fn setup;
 	trigger_test_fn trigger;
@@ -497,11 +525,13 @@ static struct core_reloc_test_case test_cases[] = {
 			.comm_len = sizeof("test_progs"),
 		},
 		.output_len = sizeof(struct core_reloc_kernel_output),
+		.raw_tp_name = "sys_enter",
+		.prog_name = "test_core_kernel",
 	},
 
 	/* validate we can find kernel module BTF types for relocs/attach */
-	MODULES_CASE("module_probed", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"),
-	MODULES_CASE("module_direct", "tp_btf/bpf_testmod_test_read", NULL),
+	MODULES_CASE("module_probed", "test_core_module_probed", "bpf_testmod_test_read"),
+	MODULES_CASE("module_direct", "test_core_module_direct", NULL),
 
 	/* validate BPF program can use multiple flavors to match against
 	 * single target BTF type
@@ -580,6 +610,8 @@ static struct core_reloc_test_case test_cases[] = {
 			.c = 0, /* BUG in clang, should be 3 */
 		},
 		.output_len = sizeof(struct core_reloc_misc_output),
+		.raw_tp_name = "sys_enter",
+		.prog_name = "test_core_misc",
 	},
 
 	/* validate field existence checks */
@@ -848,14 +880,9 @@ void test_core_reloc(void)
 		if (!ASSERT_OK_PTR(obj, "obj_open"))
 			goto cleanup;
 
-		probe_name = "raw_tracepoint/sys_enter";
-		tp_name = "sys_enter";
-		if (test_case->prog_sec_name) {
-			probe_name = test_case->prog_sec_name;
-			tp_name = test_case->raw_tp_name; /* NULL for tp_btf */
-		}
-
-		prog = bpf_object__find_program_by_title(obj, probe_name);
+		probe_name = test_case->prog_name;
+		tp_name = test_case->raw_tp_name; /* NULL for tp_btf */
+		prog = bpf_object__find_program_by_name(obj, probe_name);
 		if (CHECK(!prog, "find_probe",
 			  "prog '%s' not found\n", probe_name))
 			goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
index fdd603ebda28..c52f99f6a909 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -101,6 +101,8 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 
 	for (i = 0; i < prog_cnt; i++) {
 		struct bpf_link_info link_info;
+		struct bpf_program *pos;
+		const char *pos_sec_name;
 		char *tgt_name;
 		__s32 btf_id;
 
@@ -109,7 +111,14 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 			goto close_prog;
 		btf_id = btf__find_by_name_kind(btf, tgt_name + 1, BTF_KIND_FUNC);
 
-		prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]);
+		prog[i] = NULL;
+		bpf_object__for_each_program(pos, obj) {
+			pos_sec_name = bpf_program__section_name(pos);
+			if (pos_sec_name && !strcmp(pos_sec_name, prog_name[i])) {
+				prog[i] = pos;
+				break;
+			}
+		}
 		if (!ASSERT_OK_PTR(prog[i], prog_name[i]))
 			goto close_prog;
 
@@ -211,8 +220,8 @@ static void test_func_replace_verify(void)
 
 static int test_second_attach(struct bpf_object *obj)
 {
-	const char *prog_name = "freplace/get_constant";
-	const char *tgt_name = prog_name + 9; /* cut off freplace/ */
+	const char *prog_name = "security_new_get_constant";
+	const char *tgt_name = "get_constant";
 	const char *tgt_obj_file = "./test_pkt_access.o";
 	struct bpf_program *prog = NULL;
 	struct bpf_object *tgt_obj;
@@ -220,7 +229,7 @@ static int test_second_attach(struct bpf_object *obj)
 	struct bpf_link *link;
 	int err = 0, tgt_fd;
 
-	prog = bpf_object__find_program_by_title(obj, prog_name);
+	prog = bpf_object__find_program_by_name(obj, prog_name);
 	if (CHECK(!prog, "find_prog", "prog %s not found\n", prog_name))
 		return -ENOENT;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 977ab433a946..e834a01de16a 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -89,7 +89,7 @@ void test_get_stack_raw_tp(void)
 {
 	const char *file = "./test_get_stack_rawtp.o";
 	const char *file_err = "./test_get_stack_rawtp_err.o";
-	const char *prog_name = "raw_tracepoint/sys_enter";
+	const char *prog_name = "bpf_prog1";
 	int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP;
 	struct perf_buffer *pb = NULL;
 	struct bpf_link *link = NULL;
@@ -107,7 +107,7 @@ void test_get_stack_raw_tp(void)
 	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
-	prog = bpf_object__find_program_by_title(obj, prog_name);
+	prog = bpf_object__find_program_by_name(obj, prog_name);
 	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
 		goto close_prog;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 6a953f4adfdc..8ed78a9383ba 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -136,7 +136,8 @@ static int start_server(void)
 	return fd;
 }
 
-static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title,
+		       const char *prog_name)
 {
 	enum bpf_attach_type attach_type;
 	enum bpf_prog_type prog_type;
@@ -145,20 +146,20 @@ static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
 
 	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
 	if (err) {
-		log_err("Failed to deduct types for %s BPF program", title);
+		log_err("Failed to deduct types for %s BPF program", prog_name);
 		return -1;
 	}
 
-	prog = bpf_object__find_program_by_title(obj, title);
+	prog = bpf_object__find_program_by_name(obj, prog_name);
 	if (!prog) {
-		log_err("Failed to find %s BPF program", title);
+		log_err("Failed to find %s BPF program", prog_name);
 		return -1;
 	}
 
 	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
 			      attach_type, 0);
 	if (err) {
-		log_err("Failed to attach %s BPF program", title);
+		log_err("Failed to attach %s BPF program", prog_name);
 		return -1;
 	}
 
@@ -181,11 +182,11 @@ static void run_test(int cgroup_fd)
 	if (!ASSERT_OK(err, "obj_load"))
 		goto close_bpf_object;
 
-	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt", "_getsockopt");
 	if (CHECK_FAIL(err))
 		goto close_bpf_object;
 
-	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt", "_setsockopt");
 	if (CHECK_FAIL(err))
 		goto close_bpf_object;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index 337493d74ec5..313f0a66232e 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -4,7 +4,7 @@
 void test_stacktrace_map(void)
 {
 	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
-	const char *prog_name = "tracepoint/sched/sched_switch";
+	const char *prog_name = "oncpu";
 	int err, prog_fd, stack_trace_len;
 	const char *file = "./test_stacktrace_map.o";
 	__u32 key, val, duration = 0;
@@ -16,7 +16,7 @@ void test_stacktrace_map(void)
 	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
 		return;
 
-	prog = bpf_object__find_program_by_title(obj, prog_name);
+	prog = bpf_object__find_program_by_name(obj, prog_name);
 	if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name))
 		goto close_prog;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
index 063a14a2060d..1cb8dd36bd8f 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -3,7 +3,7 @@
 
 void test_stacktrace_map_raw_tp(void)
 {
-	const char *prog_name = "tracepoint/sched/sched_switch";
+	const char *prog_name = "oncpu";
 	int control_map_fd, stackid_hmap_fd, stackmap_fd;
 	const char *file = "./test_stacktrace_map.o";
 	__u32 key, val, duration = 0;
@@ -16,7 +16,7 @@ void test_stacktrace_map_raw_tp(void)
 	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
 		return;
 
-	prog = bpf_object__find_program_by_title(obj, prog_name);
+	prog = bpf_object__find_program_by_name(obj, prog_name);
 	if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name))
 		goto close_prog;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
index 123c68c1917d..05acb376f74d 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
@@ -56,11 +56,11 @@ static void setaffinity(void)
 
 void test_test_overhead(void)
 {
-	const char *kprobe_name = "kprobe/__set_task_comm";
-	const char *kretprobe_name = "kretprobe/__set_task_comm";
-	const char *raw_tp_name = "raw_tp/task_rename";
-	const char *fentry_name = "fentry/__set_task_comm";
-	const char *fexit_name = "fexit/__set_task_comm";
+	const char *kprobe_name = "prog1";
+	const char *kretprobe_name = "prog2";
+	const char *raw_tp_name = "prog3";
+	const char *fentry_name = "prog4";
+	const char *fexit_name = "prog5";
 	const char *kprobe_func = "__set_task_comm";
 	struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog;
 	struct bpf_program *fentry_prog, *fexit_prog;
@@ -76,23 +76,23 @@ void test_test_overhead(void)
 	if (!ASSERT_OK_PTR(obj, "obj_open_file"))
 		return;
 
-	kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name);
+	kprobe_prog = bpf_object__find_program_by_name(obj, kprobe_name);
 	if (CHECK(!kprobe_prog, "find_probe",
 		  "prog '%s' not found\n", kprobe_name))
 		goto cleanup;
-	kretprobe_prog = bpf_object__find_program_by_title(obj, kretprobe_name);
+	kretprobe_prog = bpf_object__find_program_by_name(obj, kretprobe_name);
 	if (CHECK(!kretprobe_prog, "find_probe",
 		  "prog '%s' not found\n", kretprobe_name))
 		goto cleanup;
-	raw_tp_prog = bpf_object__find_program_by_title(obj, raw_tp_name);
+	raw_tp_prog = bpf_object__find_program_by_name(obj, raw_tp_name);
 	if (CHECK(!raw_tp_prog, "find_probe",
 		  "prog '%s' not found\n", raw_tp_name))
 		goto cleanup;
-	fentry_prog = bpf_object__find_program_by_title(obj, fentry_name);
+	fentry_prog = bpf_object__find_program_by_name(obj, fentry_name);
 	if (CHECK(!fentry_prog, "find_probe",
 		  "prog '%s' not found\n", fentry_name))
 		goto cleanup;
-	fexit_prog = bpf_object__find_program_by_title(obj, fexit_name);
+	fexit_prog = bpf_object__find_program_by_name(obj, fexit_name);
 	if (CHECK(!fexit_prog, "find_probe",
 		  "prog '%s' not found\n", fexit_name))
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index fc146671b20a..9c795ee52b7b 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -35,7 +35,7 @@ static struct bpf_link *load(struct bpf_object *obj, const char *name)
 	struct bpf_program *prog;
 	int duration = 0;
 
-	prog = bpf_object__find_program_by_title(obj, name);
+	prog = bpf_object__find_program_by_name(obj, name);
 	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", name))
 		return ERR_PTR(-EINVAL);
 	return bpf_program__attach_trace(prog);
@@ -44,8 +44,8 @@ static struct bpf_link *load(struct bpf_object *obj, const char *name)
 /* TODO: use different target function to run in concurrent mode */
 void serial_test_trampoline_count(void)
 {
-	const char *fentry_name = "fentry/__set_task_comm";
-	const char *fexit_name = "fexit/__set_task_comm";
+	const char *fentry_name = "prog1";
+	const char *fexit_name = "prog2";
 	const char *object = "test_trampoline_count.o";
 	struct inst inst[MAX_TRAMP_PROGS] = {};
 	int err, i = 0, duration = 0;
-- 
cgit v1.2.3


From b098f33692d75d184a3ab62095c376fd0e52d880 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 13 Dec 2021 19:59:30 -0800
Subject: tools/perf: Stop using bpf_object__find_program_by_title API.

bpf_obj__find_program_by_title() in libbpf is going to be deprecated.
Call bpf_object_for_each_program to find a program in the section with
a given name instead.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211214035931.1148209-4-kuifeng@fb.com
---
 tools/perf/builtin-trace.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0b52e08e558e..97121fb45842 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3257,10 +3257,21 @@ static void trace__set_bpf_map_syscalls(struct trace *trace)
 
 static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
 {
+	struct bpf_program *pos, *prog = NULL;
+	const char *sec_name;
+
 	if (trace->bpf_obj == NULL)
 		return NULL;
 
-	return bpf_object__find_program_by_title(trace->bpf_obj, name);
+	bpf_object__for_each_program(pos, trace->bpf_obj) {
+		sec_name = bpf_program__section_name(pos);
+		if (sec_name && !strcmp(sec_name, name)) {
+			prog = pos;
+			break;
+		}
+	}
+
+	return prog;
 }
 
 static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc,
-- 
cgit v1.2.3


From 0da2596f343c0b234344ec47d51cbce3bde23dea Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 13 Dec 2021 19:59:31 -0800
Subject: libbpf: Mark bpf_object__find_program_by_title API deprecated.

Deprecate this API since v0.7.  All callers should move to
bpf_object__find_program_by_name if possible, otherwise use
bpf_object__for_each_program to find a program out from a given
section.

[0] Closes: https://github.com/libbpf/libbpf/issues/292

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211214035931.1148209-5-kuifeng@fb.com
---
 tools/lib/bpf/libbpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 3f5ff2c09dfe..42b2f36fd9f0 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -227,6 +227,7 @@ struct btf;
 LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj);
 LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj);
 
+LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__find_program_by_name() instead")
 LIBBPF_API struct bpf_program *
 bpf_object__find_program_by_title(const struct bpf_object *obj,
 				  const char *title);
-- 
cgit v1.2.3


From 180486b430f4e22cc00a478163d942804baae4b5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 7 Dec 2021 10:07:04 +0000
Subject: bpf, selftests: Add test case for atomic fetch on spilled pointer

Test whether unprivileged would be able to leak the spilled pointer either
by exporting the returned value from the atomic{32,64} operation or by reading
and exporting the value from the stack after the atomic operation took place.

Note that for unprivileged, the below atomic cmpxchg test case named "Dest
pointer in r0 - succeed" is failing. The reason is that in the dst memory
location (r10 -8) there is the spilled register r10:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (bf) r0 = r10
  1: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (7b) *(u64 *)(r10 -8) = r0
  2: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=fp
  2: (b7) r1 = 0
  3: R0_w=fp0 R1_w=invP0 R10=fp0 fp-8_w=fp
  3: (db) r0 = atomic64_cmpxchg((u64 *)(r10 -8), r0, r1)
  4: R0_w=fp0 R1_w=invP0 R10=fp0 fp-8_w=mmmmmmmm
  4: (79) r1 = *(u64 *)(r0 -8)
  5: R0_w=fp0 R1_w=invP(id=0) R10=fp0 fp-8_w=mmmmmmmm
  5: (b7) r0 = 0
  6: R0_w=invP0 R1_w=invP(id=0) R10=fp0 fp-8_w=mmmmmmmm
  6: (95) exit

However, allowing this case for unprivileged is a bit useless given an
update with a new pointer will fail anyway:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (bf) r0 = r10
  1: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (7b) *(u64 *)(r10 -8) = r0
  2: R0_w=fp0 R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=fp
  2: (db) r0 = atomic64_cmpxchg((u64 *)(r10 -8), r0, r10)
  R10 leaks addr into mem

Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/atomic_cmpxchg.c        | 23 ++++++
 .../testing/selftests/bpf/verifier/atomic_fetch.c  | 94 ++++++++++++++++++++++
 2 files changed, 117 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
index c22dc83a41fd..0ffc69f602af 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
@@ -156,4 +156,27 @@
 		BPF_EXIT_INSN(),
 	},
 	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"Dest pointer in r0 - succeed, check 2",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r0 */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R5 leaks addr into mem",
 },
diff --git a/tools/testing/selftests/bpf/verifier/atomic_fetch.c b/tools/testing/selftests/bpf/verifier/atomic_fetch.c
index 3bc9ff7a860b..5bf03fb4fa2b 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_fetch.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_fetch.c
@@ -1,3 +1,97 @@
+{
+	"atomic dw/fetch and address leakage of (map ptr & -1) via stack slot",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_2, 0),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"atomic dw/fetch and address leakage of (map ptr & -1) via returned value",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "leaking pointer from stack off -8",
+},
+{
+	"atomic w/fetch and address leakage of (map ptr & -1) via stack slot",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_W, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_2, 0),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+},
+{
+	"atomic w/fetch and address leakage of (map ptr & -1) via returned value",
+	.insns = {
+		BPF_LD_IMM64(BPF_REG_1, -1),
+		BPF_LD_MAP_FD(BPF_REG_8, 0),
+		BPF_LD_MAP_FD(BPF_REG_9, 0),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
+		BPF_ATOMIC_OP(BPF_W, BPF_AND | BPF_FETCH, BPF_REG_2, BPF_REG_1, 0),
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+		BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 2, 4 },
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+},
 #define __ATOMIC_FETCH_OP_TEST(src_reg, dst_reg, operand1, op, operand2, expect) \
 	{								\
 		"atomic fetch " #op ", src=" #dst_reg " dst=" #dst_reg,	\
-- 
cgit v1.2.3


From e523102cb719cbad1673b6aa2a4d5c1fa6f13799 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 13 Dec 2021 22:25:23 +0000
Subject: bpf, selftests: Update test case for atomic cmpxchg on r0 with
 pointer

Fix up unprivileged test case results for 'Dest pointer in r0' verifier tests
given they now need to reject R0 containing a pointer value, and add a couple
of new related ones with 32bit cmpxchg as well.

  root@foo:~/bpf/tools/testing/selftests/bpf# ./test_verifier
  #0/u invalid and of negative number OK
  #0/p invalid and of negative number OK
  [...]
  #1268/p XDP pkt read, pkt_meta' <= pkt_data, bad access 1 OK
  #1269/p XDP pkt read, pkt_meta' <= pkt_data, bad access 2 OK
  #1270/p XDP pkt read, pkt_data <= pkt_meta', good access OK
  #1271/p XDP pkt read, pkt_data <= pkt_meta', bad access 1 OK
  #1272/p XDP pkt read, pkt_data <= pkt_meta', bad access 2 OK
  Summary: 1900 PASSED, 0 SKIPPED, 0 FAILED

Acked-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/atomic_cmpxchg.c        | 67 +++++++++++++++++++++-
 1 file changed, 65 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
index 0ffc69f602af..b39665f33524 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
@@ -138,6 +138,8 @@
 		BPF_EXIT_INSN(),
 	},
 	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R0 leaks addr into mem",
 },
 {
 	"Dest pointer in r0 - succeed",
@@ -157,7 +159,7 @@
 	},
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "leaking pointer from stack off -8",
+	.errstr_unpriv = "R0 leaks addr into mem",
 },
 {
 	"Dest pointer in r0 - succeed, check 2",
@@ -178,5 +180,66 @@
 	},
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "R5 leaks addr into mem",
+	.errstr_unpriv = "R0 leaks addr into mem",
+},
+{
+	"Dest pointer in r0 - succeed, check 3",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "invalid size of register fill",
+	.errstr_unpriv = "R0 leaks addr into mem",
+},
+{
+	"Dest pointer in r0 - succeed, check 4",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV32_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV32_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r10 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "R10 partial copy of pointer",
+},
+{
+	"Dest pointer in r0 - succeed, check 5",
+	.insns = {
+		/* r0 = &val */
+		BPF_MOV32_REG(BPF_REG_0, BPF_REG_10),
+		/* val = r0; */
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+		/* r5 = &val */
+		BPF_MOV32_REG(BPF_REG_5, BPF_REG_10),
+		/* r0 = atomic_cmpxchg(&val, r0, r5); */
+		BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_5, -8),
+		/* r1 = *r0 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -8),
+		/* exit(0); */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "R0 invalid mem access",
+	.errstr_unpriv = "R10 partial copy of pointer",
 },
-- 
cgit v1.2.3


From f97982398cc1c92f2e9bd0ef1ef870a5a729b0ac Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 14 Dec 2021 15:20:54 -0800
Subject: libbpf: Avoid reading past ELF data section end when copying license

Fix possible read beyond ELF "license" data section if the license
string is not properly zero-terminated. Use the fact that libbpf_strlcpy
never accesses the (N-1)st byte of the source string because it's
replaced with '\0' anyways.

If this happens, it's a violation of contract between libbpf and a user,
but not handling this more robustly upsets CIFuzz, so given the fix is
trivial, let's fix the potential issue.

Fixes: 9fc205b413b3 ("libbpf: Add sane strncpy alternative and use it internally")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211214232054.3458774-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 684c34ce1a26..cf862a19222b 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1320,7 +1320,10 @@ static int bpf_object__check_endianness(struct bpf_object *obj)
 static int
 bpf_object__init_license(struct bpf_object *obj, void *data, size_t size)
 {
-	libbpf_strlcpy(obj->license, data, sizeof(obj->license));
+	/* libbpf_strlcpy() only copies first N - 1 bytes, so size + 1 won't
+	 * go over allowed ELF data section buffer
+	 */
+	libbpf_strlcpy(obj->license, data, min(size + 1, sizeof(obj->license)));
 	pr_debug("license of %s is %s\n", obj->path, obj->license);
 	return 0;
 }
-- 
cgit v1.2.3


From fb488be8c28de63c6298b74d2cc422cf8ae39716 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 14 Dec 2021 16:25:51 +0200
Subject: selftests: mlxsw: vxlan: Remove IPv6 test case

Currently, there is a test case to verify that VxLAN with IPv6 underlay
is forbidden.

Remove this test case as support for VxLAN with IPv6 underlay was added
by the previous patch.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/drivers/net/mlxsw/vxlan.sh | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
index 729a86cc4ede..3639b89c81ba 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
@@ -145,23 +145,6 @@ sanitization_single_dev_no_local_ip_test()
 	log_test "vxlan device with no local ip"
 }
 
-sanitization_single_dev_local_ipv6_test()
-{
-	RET=0
-
-	ip link add dev br0 type bridge mcast_snooping 0
-
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 2001:db8::1 dstport 4789
-
-	sanitization_single_dev_test_fail
-
-	ip link del dev vxlan0
-	ip link del dev br0
-
-	log_test "vxlan device with local ipv6 address"
-}
-
 sanitization_single_dev_learning_enabled_test()
 {
 	RET=0
@@ -276,7 +259,6 @@ sanitization_single_dev_test()
 	sanitization_single_dev_mcast_enabled_test
 	sanitization_single_dev_mcast_group_test
 	sanitization_single_dev_no_local_ip_test
-	sanitization_single_dev_local_ipv6_test
 	sanitization_single_dev_learning_enabled_test
 	sanitization_single_dev_local_interface_test
 	sanitization_single_dev_port_range_test
-- 
cgit v1.2.3


From 1fb466dff904e4a72282af336f2c355f011eec61 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 Dec 2021 11:24:50 -0600
Subject: objtool: Add a missing comma to avoid string concatenation

Recently the kbuild robot reported two new errors:

>> lib/kunit/kunit-example-test.o: warning: objtool: .text.unlikely: unexpected end of section
>> arch/x86/kernel/dumpstack.o: warning: objtool: oops_end() falls through to next function show_opcodes()

I don't know why they did not occur in my test setup but after digging
it I realized I had accidentally dropped a comma in
tools/objtool/check.c when I renamed rewind_stack_do_exit to
rewind_stack_and_make_dead.

Add that comma back to fix objtool errors.

Link: https://lkml.kernel.org/r/202112140949.Uq5sFKR1-lkp@intel.com
Fixes: 0e25498f8cd4 ("exit: Add and use make_task_dead.")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 tools/objtool/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 282273a1ffa5..3fc2c57f6124 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -177,7 +177,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"fortify_panic",
 		"usercopy_abort",
 		"machine_real_restart",
-		"rewind_stack_and_make_dead"
+		"rewind_stack_and_make_dead",
 		"kunit_try_catch_throw",
 		"xen_start_kernel",
 		"cpu_bringup_and_idle",
-- 
cgit v1.2.3


From db1679813f9f86b05bbbc6f05f4cdbe481d59352 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Tue, 14 Dec 2021 11:26:11 -0800
Subject: kunit: tool: use dataclass instead of collections.namedtuple

namedtuple is a terse way of defining a collection of fields.
However, it does not allow us to annotate the type of these fields.
It also doesn't let us have any sort of inheritance between types.

Since commit df4b0807ca1a ("kunit: tool: Assert the version
requirement"), kunit.py has asserted that it's running on python >=3.7.

So in that case use a 3.7 feature, dataclasses, to replace these.

Changes in detail:
* Make KunitExecRequest contain all the fields needed for exec_tests
* Use inheritance to dedupe fields
  * also allows us to e.g. pass a KUnitRequest in as a KUnitParseRequest
  * this has changed around the order of some fields
* Use named arguments when constructing all request objects in kunit.py
  * This is to prevent accidentally mixing up fields, etc.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           | 139 +++++++++++++++++----------------
 tools/testing/kunit/kunit_tool_test.py |   6 +-
 2 files changed, 75 insertions(+), 70 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 8c52faf5787c..4f5c7984cf3f 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -15,38 +15,57 @@ import time
 
 assert sys.version_info >= (3, 7), "Python version is too old"
 
-from collections import namedtuple
+from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Iterable, Sequence, List
+from typing import Any, Iterable, Sequence, List, Optional
 
 import kunit_json
 import kunit_kernel
 import kunit_parser
 
-KunitResult = namedtuple('KunitResult', ['status','result','elapsed_time'])
-
-KunitConfigRequest = namedtuple('KunitConfigRequest',
-				['build_dir', 'make_options'])
-KunitBuildRequest = namedtuple('KunitBuildRequest',
-			       ['jobs', 'build_dir', 'alltests',
-				'make_options'])
-KunitExecRequest = namedtuple('KunitExecRequest',
-			      ['timeout', 'build_dir', 'alltests',
-			       'filter_glob', 'kernel_args', 'run_isolated'])
-KunitParseRequest = namedtuple('KunitParseRequest',
-			       ['raw_output', 'build_dir', 'json'])
-KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs',
-					   'build_dir', 'alltests', 'filter_glob',
-					   'kernel_args', 'run_isolated', 'json', 'make_options'])
-
-KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0]
-
 class KunitStatus(Enum):
 	SUCCESS = auto()
 	CONFIG_FAILURE = auto()
 	BUILD_FAILURE = auto()
 	TEST_FAILURE = auto()
 
+@dataclass
+class KunitResult:
+	status: KunitStatus
+	result: Any
+	elapsed_time: float
+
+@dataclass
+class KunitConfigRequest:
+	build_dir: str
+	make_options: Optional[List[str]]
+
+@dataclass
+class KunitBuildRequest(KunitConfigRequest):
+	jobs: int
+	alltests: bool
+
+@dataclass
+class KunitParseRequest:
+	raw_output: Optional[str]
+	build_dir: str
+	json: Optional[str]
+
+@dataclass
+class KunitExecRequest(KunitParseRequest):
+	timeout: int
+	alltests: bool
+	filter_glob: str
+	kernel_args: Optional[List[str]]
+	run_isolated: Optional[str]
+
+@dataclass
+class KunitRequest(KunitExecRequest, KunitBuildRequest):
+	pass
+
+
+KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0]
+
 def get_kernel_root_path() -> str:
 	path = sys.argv[0] if not __file__ else __file__
 	parts = os.path.realpath(path).split('tools/testing/kunit')
@@ -121,8 +140,7 @@ def _suites_from_test_list(tests: List[str]) -> List[str]:
 
 
-def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest,
-	       parse_request: KunitParseRequest) -> KunitResult:
+def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -> KunitResult:
 	filter_globs = [request.filter_glob]
 	if request.run_isolated:
 		tests = _list_tests(linux, request)
@@ -147,7 +165,7 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest,
 			filter_glob=filter_glob,
 			build_dir=request.build_dir)
 
-		result = parse_tests(parse_request, run_result)
+		result = parse_tests(request, run_result)
 		# run_kernel() doesn't block on the kernel exiting.
 		# That only happens after we get the last line of output from `run_result`.
 		# So exec_time here actually contains parsing + execution time, which is fine.
@@ -217,27 +235,15 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree,
 	      request: KunitRequest) -> KunitResult:
 	run_start = time.time()
 
-	config_request = KunitConfigRequest(request.build_dir,
-					    request.make_options)
-	config_result = config_tests(linux, config_request)
+	config_result = config_tests(linux, request)
 	if config_result.status != KunitStatus.SUCCESS:
 		return config_result
 
-	build_request = KunitBuildRequest(request.jobs, request.build_dir,
-					  request.alltests,
-					  request.make_options)
-	build_result = build_tests(linux, build_request)
+	build_result = build_tests(linux, request)
 	if build_result.status != KunitStatus.SUCCESS:
 		return build_result
 
-	exec_request = KunitExecRequest(request.timeout, request.build_dir,
-				 request.alltests, request.filter_glob,
-				 request.kernel_args, request.run_isolated)
-	parse_request = KunitParseRequest(request.raw_output,
-					  request.build_dir,
-					  request.json)
-
-	exec_result = exec_tests(linux, exec_request, parse_request)
+	exec_result = exec_tests(linux, request)
 
 	run_end = time.time()
 
@@ -413,16 +419,16 @@ def main(argv, linux=None):
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
 
-		request = KunitRequest(cli_args.raw_output,
-				       cli_args.timeout,
-				       cli_args.jobs,
-				       cli_args.build_dir,
-				       cli_args.alltests,
-				       cli_args.filter_glob,
-				       cli_args.kernel_args,
-				       cli_args.run_isolated,
-				       cli_args.json,
-				       cli_args.make_options)
+		request = KunitRequest(build_dir=cli_args.build_dir,
+				       make_options=cli_args.make_options,
+				       jobs=cli_args.jobs,
+				       alltests=cli_args.alltests,
+				       raw_output=cli_args.raw_output,
+				       json=cli_args.json,
+				       timeout=cli_args.timeout,
+				       filter_glob=cli_args.filter_glob,
+				       kernel_args=cli_args.kernel_args,
+				       run_isolated=cli_args.run_isolated)
 		result = run_tests(linux, request)
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
@@ -439,8 +445,8 @@ def main(argv, linux=None):
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
 
-		request = KunitConfigRequest(cli_args.build_dir,
-					     cli_args.make_options)
+		request = KunitConfigRequest(build_dir=cli_args.build_dir,
+					     make_options=cli_args.make_options)
 		result = config_tests(linux, request)
 		kunit_parser.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (
@@ -456,10 +462,10 @@ def main(argv, linux=None):
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
 
-		request = KunitBuildRequest(cli_args.jobs,
-					    cli_args.build_dir,
-					    cli_args.alltests,
-					    cli_args.make_options)
+		request = KunitBuildRequest(build_dir=cli_args.build_dir,
+					    make_options=cli_args.make_options,
+					    jobs=cli_args.jobs,
+					    alltests=cli_args.alltests)
 		result = build_tests(linux, request)
 		kunit_parser.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (
@@ -475,16 +481,15 @@ def main(argv, linux=None):
 					cross_compile=cli_args.cross_compile,
 					qemu_config_path=cli_args.qemu_config)
 
-		exec_request = KunitExecRequest(cli_args.timeout,
-						cli_args.build_dir,
-						cli_args.alltests,
-						cli_args.filter_glob,
-						cli_args.kernel_args,
-						cli_args.run_isolated)
-		parse_request = KunitParseRequest(cli_args.raw_output,
-						  cli_args.build_dir,
-						  cli_args.json)
-		result = exec_tests(linux, exec_request, parse_request)
+		exec_request = KunitExecRequest(raw_output=cli_args.raw_output,
+						build_dir=cli_args.build_dir,
+						json=cli_args.json,
+						timeout=cli_args.timeout,
+						alltests=cli_args.alltests,
+						filter_glob=cli_args.filter_glob,
+						kernel_args=cli_args.kernel_args,
+						run_isolated=cli_args.run_isolated)
+		result = exec_tests(linux, exec_request)
 		kunit_parser.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (result.elapsed_time))
 		if result.status != KunitStatus.SUCCESS:
@@ -496,9 +501,9 @@ def main(argv, linux=None):
 		else:
 			with open(cli_args.file, 'r', errors='backslashreplace') as f:
 				kunit_output = f.read().splitlines()
-		request = KunitParseRequest(cli_args.raw_output,
-					    None,
-					    cli_args.json)
+		request = KunitParseRequest(raw_output=cli_args.raw_output,
+					    build_dir='',
+					    json=cli_args.json)
 		result = parse_tests(request, kunit_output)
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 97a4c72af2b8..e30e162bd084 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -691,7 +691,7 @@ class KUnitMainTest(unittest.TestCase):
 		self.linux_source_mock.run_kernel.return_value = ['TAP version 14', 'init: random output'] + want
 
 		got = kunit._list_tests(self.linux_source_mock,
-				     kunit.KunitExecRequest(300, '.kunit', False, 'suite*', None, 'suite'))
+				     kunit.KunitExecRequest(None, '.kunit', None, 300, False, 'suite*', None, 'suite'))
 
 		self.assertEqual(got, want)
 		# Should respect the user's filter glob when listing tests.
@@ -706,7 +706,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(300, '.kunit', False, 'suite*.test*', None, 'suite'))
+				     kunit.KunitExecRequest(None, '.kunit', None, 300, False, 'suite*.test*', None, 'suite'))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test*', timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test*', timeout=300),
@@ -719,7 +719,7 @@ class KUnitMainTest(unittest.TestCase):
 
 		# Should respect the user's filter glob when listing tests.
 		mock_tests.assert_called_once_with(mock.ANY,
-				     kunit.KunitExecRequest(300, '.kunit', False, 'suite*', None, 'test'))
+				     kunit.KunitExecRequest(None, '.kunit', None, 300, False, 'suite*', None, 'test'))
 		self.linux_source_mock.run_kernel.assert_has_calls([
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test1', timeout=300),
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite.test2', timeout=300),
-- 
cgit v1.2.3


From e0cc8c052a3992b01f51df1d51eaae49e5b2710f Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Tue, 14 Dec 2021 11:26:12 -0800
Subject: kunit: tool: delete kunit_parser.TestResult type

The `log` field is unused, and the `status` field is accessible via
`test.status`.

So it's simpler to just return the main `Test` object directly.

And since we're no longer returning a namedtuple, which has no type
annotations, this hopefully means typecheckers are better equipped to
find any errors.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           | 14 ++++++--------
 tools/testing/kunit/kunit_json.py      |  6 +++---
 tools/testing/kunit/kunit_parser.py    | 10 +++-------
 tools/testing/kunit/kunit_tool_test.py | 34 +++++++++++++++++-----------------
 4 files changed, 29 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 4f5c7984cf3f..417dc2d11f4f 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -172,7 +172,7 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -
 		test_end = time.time()
 		exec_time += test_end - test_start
 
-		test_counts.add_subtest_counts(result.result.test.counts)
+		test_counts.add_subtest_counts(result.result.counts)
 
 	if len(filter_globs) == 1 and test_counts.crashed > 0:
 		bd = request.build_dir
@@ -181,7 +181,7 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -
 				bd, bd, kunit_kernel.get_outfile_path(bd), bd, sys.argv[0]))
 
 	kunit_status = _map_to_overall_status(test_counts.get_status())
-	return KunitResult(status=kunit_status, result=result.result, elapsed_time=exec_time)
+	return KunitResult(status=kunit_status, result=result, elapsed_time=exec_time)
 
 def _map_to_overall_status(test_status: kunit_parser.TestStatus) -> KunitStatus:
 	if test_status in (kunit_parser.TestStatus.SUCCESS, kunit_parser.TestStatus.SKIPPED):
@@ -192,14 +192,12 @@ def _map_to_overall_status(test_status: kunit_parser.TestStatus) -> KunitStatus:
 def parse_tests(request: KunitParseRequest, input_data: Iterable[str]) -> KunitResult:
 	parse_start = time.time()
 
-	test_result = kunit_parser.TestResult(kunit_parser.TestStatus.SUCCESS,
-					      kunit_parser.Test(),
-					      'Tests not Parsed.')
+	test_result = kunit_parser.Test()
 
 	if request.raw_output:
 		# Treat unparsed results as one passing test.
-		test_result.test.status = kunit_parser.TestStatus.SUCCESS
-		test_result.test.counts.passed = 1
+		test_result.status = kunit_parser.TestStatus.SUCCESS
+		test_result.counts.passed = 1
 
 		output: Iterable[str] = input_data
 		if request.raw_output == 'all':
@@ -217,7 +215,7 @@ def parse_tests(request: KunitParseRequest, input_data: Iterable[str]) -> KunitR
 
 	if request.json:
 		json_obj = kunit_json.get_json_result(
-					test_result=test_result,
+					test=test_result,
 					def_config='kunit_defconfig',
 					build_dir=request.build_dir,
 					json_path=request.json)
diff --git a/tools/testing/kunit/kunit_json.py b/tools/testing/kunit/kunit_json.py
index b6e66c5d64d1..6862671709bc 100644
--- a/tools/testing/kunit/kunit_json.py
+++ b/tools/testing/kunit/kunit_json.py
@@ -11,7 +11,7 @@ import os
 
 import kunit_parser
 
-from kunit_parser import Test, TestResult, TestStatus
+from kunit_parser import Test, TestStatus
 from typing import Any, Dict, Optional
 
 JsonObj = Dict[str, Any]
@@ -50,9 +50,9 @@ def _get_group_json(test: Test, def_config: str,
 	}
 	return test_group
 
-def get_json_result(test_result: TestResult, def_config: str,
+def get_json_result(test: Test, def_config: str,
 		build_dir: Optional[str], json_path: str) -> str:
-	test_group = _get_group_json(test_result.test, def_config, build_dir)
+	test_group = _get_group_json(test, def_config, build_dir)
 	test_group["name"] = "KUnit Test Group"
 	json_obj = json.dumps(test_group, indent=4)
 	if json_path != 'stdout':
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 8e42b6ef3fe3..66a7f2fb314a 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -12,14 +12,11 @@
 from __future__ import annotations
 import re
 
-from collections import namedtuple
 from datetime import datetime
 from enum import Enum, auto
 from functools import reduce
 from typing import Iterable, Iterator, List, Optional, Tuple
 
-TestResult = namedtuple('TestResult', ['status','test','log'])
-
 class Test(object):
 	"""
 	A class to represent a test parsed from KTAP results. All KTAP
@@ -805,7 +802,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str]) -> Test:
 		print_test_result(test)
 	return test
 
-def parse_run_tests(kernel_output: Iterable[str]) -> TestResult:
+def parse_run_tests(kernel_output: Iterable[str]) -> Test:
 	"""
 	Using kernel output, extract KTAP lines, parse the lines for test
 	results and print condensed test results and summary line .
@@ -814,8 +811,7 @@ def parse_run_tests(kernel_output: Iterable[str]) -> TestResult:
 	kernel_output - Iterable object contains lines of kernel output
 
 	Return:
-	TestResult - Tuple containg status of main test object, main test
-		object with all subtests, and log of all KTAP lines.
+	Test - the main test object with all subtests.
 	"""
 	print_with_timestamp(DIVIDER)
 	lines = extract_tap_lines(kernel_output)
@@ -829,4 +825,4 @@ def parse_run_tests(kernel_output: Iterable[str]) -> TestResult:
 			test.status = test.counts.get_status()
 	print_with_timestamp(DIVIDER)
 	print_summary_line(test)
-	return TestResult(test.status, test, lines)
+	return test
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index e30e162bd084..1f6b177ca5c2 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -178,7 +178,7 @@ class KUnitParserTest(unittest.TestCase):
 		with open(empty_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()))
-		self.assertEqual(0, len(result.test.subtests))
+		self.assertEqual(0, len(result.subtests))
 		self.assertEqual(
 			kunit_parser.TestStatus.FAILURE_TO_PARSE_TESTS,
 			result.status)
@@ -191,9 +191,9 @@ class KUnitParserTest(unittest.TestCase):
 				kunit_parser.extract_tap_lines(
 				file.readlines()))
 		# A missing test plan is not an error.
-		self.assertEqual(0, result.test.counts.errors)
+		self.assertEqual(0, result.counts.errors)
 		# All tests should be accounted for.
-		self.assertEqual(10, result.test.counts.total())
+		self.assertEqual(10, result.counts.total())
 		self.assertEqual(
 			kunit_parser.TestStatus.SUCCESS,
 			result.status)
@@ -203,7 +203,7 @@ class KUnitParserTest(unittest.TestCase):
 		with open(header_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()))
-		self.assertEqual(0, len(result.test.subtests))
+		self.assertEqual(0, len(result.subtests))
 		self.assertEqual(
 			kunit_parser.TestStatus.NO_TESTS,
 			result.status)
@@ -213,11 +213,11 @@ class KUnitParserTest(unittest.TestCase):
 		with open(no_plan_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()))
-		self.assertEqual(0, len(result.test.subtests[0].subtests[0].subtests))
+		self.assertEqual(0, len(result.subtests[0].subtests[0].subtests))
 		self.assertEqual(
 			kunit_parser.TestStatus.NO_TESTS,
-			result.test.subtests[0].subtests[0].status)
-		self.assertEqual(1, result.test.counts.errors)
+			result.subtests[0].subtests[0].status)
+		self.assertEqual(1, result.counts.errors)
 
 
 	def test_no_kunit_output(self):
@@ -228,7 +228,7 @@ class KUnitParserTest(unittest.TestCase):
 				kunit_parser.extract_tap_lines(file.readlines()))
 		print_mock.assert_any_call(StrContains('invalid KTAP input!'))
 		print_mock.stop()
-		self.assertEqual(0, len(result.test.subtests))
+		self.assertEqual(0, len(result.subtests))
 
 	def test_crashed_test(self):
 		crashed_log = test_data_path('test_is_test_passed-crash.log')
@@ -269,10 +269,10 @@ class KUnitParserTest(unittest.TestCase):
 			result.status)
 		self.assertEqual(
 			"sysctl_test",
-			result.test.subtests[0].name)
+			result.subtests[0].name)
 		self.assertEqual(
 			"example",
-			result.test.subtests[1].name)
+			result.subtests[1].name)
 		file.close()
 
 
@@ -283,7 +283,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.SUCCESS,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 	def test_ignores_multiple_prefixes(self):
 		prefix_log = test_data_path('test_multiple_prefixes.log')
@@ -292,7 +292,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.SUCCESS,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 	def test_prefix_mixed_kernel_output(self):
 		mixed_prefix_log = test_data_path('test_interrupted_tap_output.log')
@@ -301,7 +301,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.SUCCESS,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 	def test_prefix_poundsign(self):
 		pound_log = test_data_path('test_pound_sign.log')
@@ -310,7 +310,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.SUCCESS,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 	def test_kernel_panic_end(self):
 		panic_log = test_data_path('test_kernel_panic_interrupt.log')
@@ -319,7 +319,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.TEST_CRASHED,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 	def test_pound_no_prefix(self):
 		pound_log = test_data_path('test_pound_no_prefix.log')
@@ -328,7 +328,7 @@ class KUnitParserTest(unittest.TestCase):
 			self.assertEqual(
 				kunit_parser.TestStatus.SUCCESS,
 				result.status)
-			self.assertEqual('kunit-resource-test', result.test.subtests[0].name)
+			self.assertEqual('kunit-resource-test', result.subtests[0].name)
 
 def line_stream_from_strs(strs: Iterable[str]) -> kunit_parser.LineStream:
 	return kunit_parser.LineStream(enumerate(strs, start=1))
@@ -467,7 +467,7 @@ class KUnitJsonTest(unittest.TestCase):
 		with open(test_data_path(log_file)) as file:
 			test_result = kunit_parser.parse_run_tests(file)
 			json_obj = kunit_json.get_json_result(
-				test_result=test_result,
+				test=test_result,
 				def_config='kunit_defconfig',
 				build_dir=None,
 				json_path='stdout')
-- 
cgit v1.2.3


From 1ee2ba89bea86d6389509e426583b49ac19b86f2 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Tue, 14 Dec 2021 11:30:10 -0800
Subject: kunit: tool: make `build` subcommand also reconfigure if needed

If I created a kunitconfig file that was incomplete, then
$ ./tools/testing/kunit/kunit.py build --kunitconfig=my_kunitconfig
would silently drop all the options with unmet dependencies!

This is because it doesn't do the config check that `kunit.py config`
does.

So if I want to safely build a kernel for testing, I have to do
$ ./tools/testing/kunit/kunit.py config <flags>
$ ./tools/testing/kunit/kunit.py build <flags, again>

It seems unlikely that any user of kunit.py would want the current
`build` semantics.
So make it effectively do `kunit.py config` + `kunit.py build`.

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           | 10 +++++++++-
 tools/testing/kunit/kunit_tool_test.py |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 417dc2d11f4f..f1be71811369 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -110,6 +110,14 @@ def build_tests(linux: kunit_kernel.LinuxSourceTree,
 			   'built kernel successfully',
 			   build_end - build_start)
 
+def config_and_build_tests(linux: kunit_kernel.LinuxSourceTree,
+			   request: KunitBuildRequest) -> KunitResult:
+	config_result = config_tests(linux, request)
+	if config_result.status != KunitStatus.SUCCESS:
+		return config_result
+
+	return build_tests(linux, request)
+
 def _list_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -> List[str]:
 	args = ['kunit.action=list']
 	if request.kernel_args:
@@ -464,7 +472,7 @@ def main(argv, linux=None):
 					    make_options=cli_args.make_options,
 					    jobs=cli_args.jobs,
 					    alltests=cli_args.alltests)
-		result = build_tests(linux, request)
+		result = config_and_build_tests(linux, request)
 		kunit_parser.print_with_timestamp((
 			'Elapsed time: %.3fs\n') % (
 				result.elapsed_time))
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 1f6b177ca5c2..b80e333a20cb 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -526,7 +526,7 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_build_passes_args_pass(self):
 		kunit.main(['build'], self.linux_source_mock)
-		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 0)
+		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
 		self.linux_source_mock.build_kernel.assert_called_once_with(False, 8, '.kunit', None)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 0)
 
-- 
cgit v1.2.3


From 85310a62ca4e415980e30d9d07f904444197cfc4 Mon Sep 17 00:00:00 2001
From: Daniel Latypov <dlatypov@google.com>
Date: Wed, 15 Dec 2021 12:39:23 -0800
Subject: kunit: tool: fix newly introduced typechecker errors

After upgrading mypy and pytype from pip, we see 2 new errors when
running ./tools/testing/kunit/run_checks.py.

Error #1: mypy and pytype
They now deduce that importlib.util.spec_from_file_location() can return
None and note that we're not checking for this.

We validate that the arch is valid (i.e. the file exists) beforehand.
Add in an `asssert spec is not None` to appease the checkers.

Error #2: pytype bug https://github.com/google/pytype/issues/1057
It doesn't like `from datetime import datetime`, specifically that a
type shares a name with a module.

We can workaround this by either
* renaming the import or just using `import datetime`
* passing the new `--fix-module-collisions` flag to pytype.

We pick the first option for now because
* the flag is quite new, only in the 2021.11.29 release.
* I'd prefer if people can just run `pytype <file>`

Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_kernel.py | 1 +
 tools/testing/kunit/kunit_parser.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 12085e04a80c..44bbe54f25f1 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -209,6 +209,7 @@ def get_source_tree_ops_from_qemu_config(config_path: str,
 	# exists as a file.
 	module_path = '.' + os.path.join(os.path.basename(QEMU_CONFIGS_DIR), os.path.basename(config_path))
 	spec = importlib.util.spec_from_file_location(module_path, config_path)
+	assert spec is not None
 	config = importlib.util.module_from_spec(spec)
 	# See https://github.com/python/typeshed/pull/2626 for context.
 	assert isinstance(spec.loader, importlib.abc.Loader)
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 66a7f2fb314a..05ff334761dd 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -12,7 +12,7 @@
 from __future__ import annotations
 import re
 
-from datetime import datetime
+import datetime
 from enum import Enum, auto
 from functools import reduce
 from typing import Iterable, Iterator, List, Optional, Tuple
@@ -517,7 +517,7 @@ ANSI_LEN = len(red(''))
 
 def print_with_timestamp(message: str) -> None:
 	"""Prints message with timestamp at beginning."""
-	print('[%s] %s' % (datetime.now().strftime('%H:%M:%S'), message))
+	print('[%s] %s' % (datetime.datetime.now().strftime('%H:%M:%S'), message))
 
 def format_test_divider(message: str, len_message: int) -> str:
 	"""
-- 
cgit v1.2.3


From ad659ccb5412874c6a89d3588cb18857c00e9d0f Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Wed, 15 Dec 2021 12:50:14 -0800
Subject: kunit: tool: Default --jobs to number of CPUs

The --jobs parameter for kunit_tool currently defaults to 8 CPUs,
regardless of the number available. For systems with significantly more
(or less), this is not as efficient. Instead, default --jobs to the
number of CPUs available to the process: while there are as many
superstitions as to exactly what the ideal jobs:CPU ratio is, this seems
sufficiently sensible to me.

A new helper function to get the default number of jobs is added:
get_default_jobs() -- this is used in kunit_tool_test instead of a
hardcoded value, or an explicit call to len(os.sched_getaffinity()), so
should be more flexible if this needs to change in the future.

Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Daniel Latypov <dlatypov@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           | 5 ++++-
 tools/testing/kunit/kunit_tool_test.py | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index f1be71811369..7a706f96f68d 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -282,6 +282,9 @@ def massage_argv(argv: Sequence[str]) -> Sequence[str]:
 		return  f'{arg}={pseudo_bool_flag_defaults[arg]}'
 	return list(map(massage_arg, argv))
 
+def get_default_jobs() -> int:
+	return len(os.sched_getaffinity(0))
+
 def add_common_opts(parser) -> None:
 	parser.add_argument('--build_dir',
 			    help='As in the make command, it specifies the build '
@@ -332,7 +335,7 @@ def add_build_opts(parser) -> None:
 	parser.add_argument('--jobs',
 			    help='As in the make command, "Specifies  the number of '
 			    'jobs (commands) to run simultaneously."',
-			    type=int, default=8, metavar='jobs')
+			    type=int, default=get_default_jobs(), metavar='jobs')
 
 def add_exec_opts(parser) -> None:
 	parser.add_argument('--timeout',
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index b80e333a20cb..352369dffbd9 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -527,7 +527,7 @@ class KUnitMainTest(unittest.TestCase):
 	def test_build_passes_args_pass(self):
 		kunit.main(['build'], self.linux_source_mock)
 		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
-		self.linux_source_mock.build_kernel.assert_called_once_with(False, 8, '.kunit', None)
+		self.linux_source_mock.build_kernel.assert_called_once_with(False, kunit.get_default_jobs(), '.kunit', None)
 		self.assertEqual(self.linux_source_mock.run_kernel.call_count, 0)
 
 	def test_exec_passes_args_pass(self):
@@ -633,8 +633,9 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_build_builddir(self):
 		build_dir = '.kunit'
+		jobs = kunit.get_default_jobs()
 		kunit.main(['build', '--build_dir', build_dir], self.linux_source_mock)
-		self.linux_source_mock.build_kernel.assert_called_once_with(False, 8, build_dir, None)
+		self.linux_source_mock.build_kernel.assert_called_once_with(False, jobs, build_dir, None)
 
 	def test_exec_builddir(self):
 		build_dir = '.kunit'
-- 
cgit v1.2.3


From 18678591846d668649fbd4f87b4a4c470818d386 Mon Sep 17 00:00:00 2001
From: Sachin Sant <sachinp@linux.vnet.ibm.com>
Date: Mon, 13 Dec 2021 22:12:23 +0530
Subject: selftests/powerpc: skip tests for unavailable mitigations.

Mitigation patching test iterates over a set of mitigations irrespective
of whether a certain mitigation is supported/available in the kernel.
This causes following messages on a kernel where some mitigations
are unavailable:

  Spawned threads enabling/disabling mitigations ...
  cat: entry_flush: No such file or directory
  cat: uaccess_flush: No such file or directory
  Waiting for timeout ...
  OK

This patch adds a check for available mitigations in the kernel.

Reported-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Reviewed-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/163941374362.36967.18016981579099073379.sendpatchset@1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa
---
 tools/testing/selftests/powerpc/security/mitigation-patching.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/security/mitigation-patching.sh b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
index b0b20e0b4e30..f43aa4b77fba 100755
--- a/tools/testing/selftests/powerpc/security/mitigation-patching.sh
+++ b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
@@ -44,7 +44,10 @@ mitigations="barrier_nospec stf_barrier count_cache_flush rfi_flush entry_flush
 
 for m in $mitigations
 do
-    do_one "$m" &
+    if [[ -f /sys/kernel/debug/powerpc/$m ]]
+    then
+        do_one "$m" &
+    fi
 done
 
 echo "Spawned threads enabling/disabling mitigations ..."
-- 
cgit v1.2.3


From c054fe9936065759b21b493ac1e7d9a2be014083 Mon Sep 17 00:00:00 2001
From: Zach DeCook <zachdecook@librem.one>
Date: Wed, 8 Dec 2021 16:48:08 -0500
Subject: iio: event_monitor: Flush output on event

By flushing the output, iio_event_monitor can be more useful to programs
chained along with it.

  iio_event_monitor stk3310 | awk '/rising/{system("my_unlockscreen.sh")} /falling/{system("my_lockscreen.sh")}'

Without this flush, the above example would buffer a number of events,
then after a while run the lock/unlock scripts several times.

Signed-off-by: Zach DeCook <zachdecook@librem.one>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 tools/iio/iio_event_monitor.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index 0076437f6e3f..b94a16ba5c6c 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -279,6 +279,7 @@ static void print_event(struct iio_event_data *event)
 		printf(", direction: %s", iio_ev_dir_text[dir]);
 
 	printf("\n");
+	fflush(stdout);
 }
 
 /* Enable or disable events in sysfs if the knob is available */
-- 
cgit v1.2.3


From f0a29c9647ff8bbb424641f79bc1894e83dec218 Mon Sep 17 00:00:00 2001
From: Sohaib Mohamed <sohaib.amhmd@gmail.com>
Date: Fri, 19 Nov 2021 08:14:08 +0200
Subject: perf bench: Use unbuffered output when pipe/tee'ing to a file

The output of 'perf bench' gets buffered when I pipe it to a file or to
tee, in such a way that I can see it only at the end.

E.g.

  $ perf bench internals synthesize -t
  < output comes out fine after each test run >

  $ perf bench internals synthesize -t | tee file.txt
  < output comes out only at the end of all tests >

This patch resolves this issue for 'bench' and 'test' subcommands.

See, also:

  $ perf bench mem all | tee file.txt
  $ perf bench sched all | tee file.txt
  $ perf bench internals all -t | tee file.txt
  $ perf bench internals all | tee file.txt

Committer testing:

It really gets staggered, i.e. outputs in bursts, when the buffer fills
up and has to be drained to make up space for more output.

Suggested-by: Riccardo Mancini <rickyman7@gmail.com>
Signed-off-by: Sohaib Mohamed <sohaib.amhmd@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fabian Hemmer <copy@copy.sh>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20211119061409.78004-1-sohaib.amhmd@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-bench.c      | 5 +++--
 tools/perf/tests/builtin-test.c | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index d0895162c2ba..d291f3a8af5f 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -226,7 +226,6 @@ static void run_collection(struct collection *coll)
 		if (!bench->fn)
 			break;
 		printf("# Running %s/%s benchmark...\n", coll->name, bench->name);
-		fflush(stdout);
 
 		argv[1] = bench->name;
 		run_bench(coll->name, bench->name, bench->fn, 1, argv);
@@ -247,6 +246,9 @@ int cmd_bench(int argc, const char **argv)
 	struct collection *coll;
 	int ret = 0;
 
+	/* Unbuffered output */
+	setvbuf(stdout, NULL, _IONBF, 0);
+
 	if (argc < 2) {
 		/* No collection specified. */
 		print_usage();
@@ -300,7 +302,6 @@ int cmd_bench(int argc, const char **argv)
 
 			if (bench_format == BENCH_FORMAT_DEFAULT)
 				printf("# Running '%s/%s' benchmark:\n", coll->name, bench->name);
-			fflush(stdout);
 			ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1);
 			goto end;
 		}
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index f1e6d2a3a578..1fb9f2a11d63 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -607,6 +607,9 @@ int cmd_test(int argc, const char **argv)
         if (ret < 0)
                 return ret;
 
+	/* Unbuffered output */
+	setvbuf(stdout, NULL, _IONBF, 0);
+
 	argc = parse_options_subcommand(argc, argv, test_options, test_subcommands, test_usage, 0);
 	if (argc >= 1 && !strcmp(argv[0], "list"))
 		return perf_test__list(argc - 1, argv + 1);
-- 
cgit v1.2.3


From b0fde9c6e291e528878ea3713997777713fe44c6 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 1 Dec 2021 14:08:55 -0800
Subject: perf arm-spe: Add SPE total latency as PERF_SAMPLE_WEIGHT

Use total latency info in the SPE counter packet as sample weight so
that we can see it in local_weight and (global) weight sort keys.

Maybe we can use PERF_SAMPLE_WEIGHT_STRUCT to support ins_lat as well
but I'm not sure which latency it matches.  So just adding total latency
first.

Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211201220855.1260688-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
 tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
 tools/perf/util/arm-spe.c                         | 5 ++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 3fc528c9270c..5e390a1a79ab 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -179,6 +179,8 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 				decoder->record.phys_addr = ip;
 			break;
 		case ARM_SPE_COUNTER:
+			if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT)
+				decoder->record.latency = payload;
 			break;
 		case ARM_SPE_CONTEXT:
 			decoder->record.context_id = payload;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 46a8556a9e95..69b31084d6be 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -33,6 +33,7 @@ struct arm_spe_record {
 	enum arm_spe_sample_type type;
 	int err;
 	u32 op;
+	u32 latency;
 	u64 from_ip;
 	u64 to_ip;
 	u64 timestamp;
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index fccac06b573a..8a3828f86901 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -330,6 +330,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
 	sample.addr = record->virt_addr;
 	sample.phys_addr = record->phys_addr;
 	sample.data_src = data_src;
+	sample.weight = record->latency;
 
 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -347,6 +348,7 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
 	sample.id = spe_events_id;
 	sample.stream_id = spe_events_id;
 	sample.addr = record->to_ip;
+	sample.weight = record->latency;
 
 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
@@ -993,7 +995,8 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
 	attr.type = PERF_TYPE_HARDWARE;
 	attr.sample_type = evsel->core.attr.sample_type & PERF_SAMPLE_MASK;
 	attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
-			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC;
+			    PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
+			    PERF_SAMPLE_WEIGHT;
 	if (spe->timeless_decoding)
 		attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
 	else
-- 
cgit v1.2.3


From 888569dbcd80a0bb87371e9fb0fa3802c7aa9b14 Mon Sep 17 00:00:00 2001
From: Salvatore Bonaccorso <carnil@debian.org>
Date: Tue, 23 Nov 2021 22:18:21 +0100
Subject: perf dlfilter: Drop unused variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling tools/perf/dlfilters/dlfilter-test-api-v0.c result in:

	checking for stdlib.h... dlfilters/dlfilter-test-api-v0.c: In function ‘filter_event’:
	dlfilters/dlfilter-test-api-v0.c:311:29: warning: unused variable ‘d’ [-Wunused-variable]
	  311 |         struct filter_data *d = data;
	      |

So remove the  variable now.

Reviewed-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Salvatore Bonaccorso <carnil@debian.org>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211123211821.132924-1-carnil@debian.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/dlfilters/dlfilter-test-api-v0.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/dlfilters/dlfilter-test-api-v0.c b/tools/perf/dlfilters/dlfilter-test-api-v0.c
index 7565a1852c74..b17eb52a0694 100644
--- a/tools/perf/dlfilters/dlfilter-test-api-v0.c
+++ b/tools/perf/dlfilters/dlfilter-test-api-v0.c
@@ -308,8 +308,6 @@ int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, vo
 
 int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx)
 {
-	struct filter_data *d = data;
-
 	pr_debug("%s API\n", __func__);
 
 	return do_checks(data, sample, ctx, false);
-- 
cgit v1.2.3


From 3987d65f45ed51a4650e911baa68f9b6ed4623cb Mon Sep 17 00:00:00 2001
From: Andrew Kilroy <andrew.kilroy@arm.com>
Date: Fri, 10 Dec 2021 12:37:04 +0000
Subject: perf vendor events: For the Arm Neoverse N2

Updates the common and microarch json file to add counters available in
the Arm Neoverse N2 chip, but should also apply to other ArmV8 and ArmV9
cpus.  Specified in ArmV8 architecture reference manual

  https://developer.arm.com/documentation/ddi0487/gb/?lang=en

Some of the counters added to armv8-common-and-microarch.json are
specified in the ArmV9 architecture reference manual supplement
(issue A.a):

  https://developer.arm.com/documentation/ddi0608/aa

The additional ArmV9 counters are

  TRB_WRAP
  TRCEXTOUT0
  TRCEXTOUT1
  TRCEXTOUT2
  TRCEXTOUT3
  CTI_TRIGOUT4
  CTI_TRIGOUT5
  CTI_TRIGOUT6
  CTI_TRIGOUT7

This patch also adds files in pmu-events/arch/arm64/arm/neoverse-n2 for
perf list to output the counter names in categories.

Counters on the Neoverse N2 are stated in its reference manual:

  https://developer.arm.com/documentation/102099/0000

Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Andrew Kilroy <andrew.kilroy@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211210123706.7490-2-andrew.kilroy@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../arch/arm64/arm/neoverse-n2/branch.json         |   8 +
 .../pmu-events/arch/arm64/arm/neoverse-n2/bus.json |  20 +++
 .../arch/arm64/arm/neoverse-n2/cache.json          | 155 ++++++++++++++++
 .../arch/arm64/arm/neoverse-n2/exception.json      |  47 +++++
 .../arch/arm64/arm/neoverse-n2/instruction.json    | 143 +++++++++++++++
 .../arch/arm64/arm/neoverse-n2/memory.json         |  38 ++++
 .../arch/arm64/arm/neoverse-n2/other.json          |   5 +
 .../arch/arm64/arm/neoverse-n2/pipeline.json       |  23 +++
 .../pmu-events/arch/arm64/arm/neoverse-n2/spe.json |  14 ++
 .../arch/arm64/arm/neoverse-n2/trace.json          |  29 +++
 .../arch/arm64/armv8-common-and-microarch.json     | 198 +++++++++++++++++++++
 tools/perf/pmu-events/arch/arm64/mapfile.csv       |   1 +
 12 files changed, 681 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json
new file mode 100644
index 000000000000..79f2016c53b0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json
@@ -0,0 +1,8 @@
+[
+    {
+        "ArchStdEvent": "BR_MIS_PRED"
+    },
+    {
+        "ArchStdEvent": "BR_PRED"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json
new file mode 100644
index 000000000000..579c1c993d17
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json
@@ -0,0 +1,20 @@
+[
+    {
+        "ArchStdEvent": "CPU_CYCLES"
+    },
+    {
+        "ArchStdEvent": "BUS_ACCESS"
+    },
+    {
+        "ArchStdEvent": "BUS_CYCLES"
+    },
+    {
+        "ArchStdEvent": "BUS_ACCESS_RD"
+    },
+    {
+        "ArchStdEvent": "BUS_ACCESS_WR"
+    },
+    {
+        "ArchStdEvent": "CNT_CYCLES"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json
new file mode 100644
index 000000000000..0141f749bff3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json
@@ -0,0 +1,155 @@
+[
+    {
+        "ArchStdEvent": "L1I_CACHE_REFILL"
+    },
+    {
+        "ArchStdEvent": "L1I_TLB_REFILL"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL"
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_ALLOCATE"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB"
+    },
+    {
+        "ArchStdEvent": "L1I_TLB"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_ALLOCATE"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB"
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK"
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK"
+    },
+    {
+        "ArchStdEvent": "LL_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "LL_CACHE_MISS_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_LMISS_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_INNER"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_OUTER"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_VICTIM"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_CLEAN"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_INVAL"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_VICTIM"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_CLEAN"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_INVAL"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_WR"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_LMISS"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_LMISS_RD"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_LMISS_RD"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json
new file mode 100644
index 000000000000..344a2d552ad5
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json
@@ -0,0 +1,47 @@
+[
+    {
+        "ArchStdEvent": "EXC_TAKEN"
+    },
+    {
+        "ArchStdEvent": "MEMORY_ERROR"
+    },
+    {
+        "ArchStdEvent": "EXC_UNDEF"
+    },
+    {
+        "ArchStdEvent": "EXC_SVC"
+    },
+    {
+        "ArchStdEvent": "EXC_PABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_DABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_IRQ"
+    },
+    {
+        "ArchStdEvent": "EXC_FIQ"
+    },
+    {
+        "ArchStdEvent": "EXC_SMC"
+    },
+    {
+        "ArchStdEvent": "EXC_HVC"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_PABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_DABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_OTHER"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_IRQ"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_FIQ"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json
new file mode 100644
index 000000000000..e57cd55937c6
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json
@@ -0,0 +1,143 @@
+[
+    {
+        "ArchStdEvent": "SW_INCR"
+    },
+    {
+        "ArchStdEvent": "INST_RETIRED"
+    },
+    {
+        "ArchStdEvent": "EXC_RETURN"
+    },
+    {
+        "ArchStdEvent": "CID_WRITE_RETIRED"
+    },
+    {
+        "ArchStdEvent": "INST_SPEC"
+    },
+    {
+        "ArchStdEvent": "TTBR_WRITE_RETIRED"
+    },
+    {
+        "ArchStdEvent": "BR_RETIRED"
+    },
+    {
+        "ArchStdEvent": "BR_MIS_PRED_RETIRED"
+    },
+    {
+        "ArchStdEvent": "OP_RETIRED"
+    },
+    {
+        "ArchStdEvent": "OP_SPEC"
+    },
+    {
+        "ArchStdEvent": "LDREX_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_PASS_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_FAIL_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_SPEC"
+    },
+    {
+        "ArchStdEvent": "LD_SPEC"
+    },
+    {
+        "ArchStdEvent": "ST_SPEC"
+    },
+    {
+        "ArchStdEvent": "DP_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_SPEC"
+    },
+    {
+        "ArchStdEvent": "VFP_SPEC"
+    },
+    {
+        "ArchStdEvent": "PC_WRITE_SPEC"
+    },
+    {
+        "ArchStdEvent": "CRYPTO_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_INDIRECT_SPEC"
+    },
+    {
+        "ArchStdEvent": "ISB_SPEC"
+    },
+    {
+        "ArchStdEvent": "DSB_SPEC"
+    },
+    {
+        "ArchStdEvent": "DMB_SPEC"
+    },
+    {
+        "ArchStdEvent": "RC_LD_SPEC"
+    },
+    {
+        "ArchStdEvent": "RC_ST_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_INST_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_INST_SPEC"
+    },
+    {
+        "ArchStdEvent": "FP_HP_SPEC"
+    },
+    {
+        "ArchStdEvent": "FP_SP_SPEC"
+    },
+    {
+        "ArchStdEvent": "FP_DP_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_EMPTY_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_FULL_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_NOT_FULL_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_LDFF_SPEC"
+    },
+    {
+        "ArchStdEvent": "SVE_LDFF_FAULT_SPEC"
+    },
+    {
+        "ArchStdEvent": "FP_SCALE_OPS_SPEC"
+    },
+    {
+        "ArchStdEvent": "FP_FIXED_OPS_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT8_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT16_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT32_SPEC"
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT64_SPEC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
new file mode 100644
index 000000000000..e522113aeb96
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
@@ -0,0 +1,38 @@
+[
+    {
+        "ArchStdEvent": "MEM_ACCESS"
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_RD"
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_WR"
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_LD_SPEC"
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_ST_SPEC"
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_LDST_SPEC"
+    },
+    {
+        "ArchStdEvent": "LDST_ALIGN_LAT"
+    },
+    {
+        "ArchStdEvent": "LD_ALIGN_LAT"
+    },
+    {
+        "ArchStdEvent": "ST_ALIGN_LAT"
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_CHECKED"
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_CHECKED_RD"
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_CHECKED_WR"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json
new file mode 100644
index 000000000000..20d8365756c5
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json
@@ -0,0 +1,5 @@
+[
+    {
+        "ArchStdEvent": "REMOTE_ACCESS"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json
new file mode 100644
index 000000000000..f9fae15f7555
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json
@@ -0,0 +1,23 @@
+[
+    {
+        "ArchStdEvent": "STALL_FRONTEND"
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND"
+    },
+    {
+        "ArchStdEvent": "STALL"
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT_BACKEND"
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT_FRONTEND"
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT"
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_MEM"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json
new file mode 100644
index 000000000000..20f2165c85fe
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json
@@ -0,0 +1,14 @@
+[
+    {
+        "ArchStdEvent": "SAMPLE_POP"
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED"
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FILTRATE"
+    },
+    {
+        "ArchStdEvent": "SAMPLE_COLLISION"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json
new file mode 100644
index 000000000000..3116135c59e2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json
@@ -0,0 +1,29 @@
+[
+    {
+        "ArchStdEvent": "TRB_WRAP"
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT0"
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT1"
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT2"
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT3"
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT4"
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT5"
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT6"
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT7"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
index 423767510aff..80d7a70829a0 100644
--- a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
@@ -299,6 +299,30 @@
         "EventName": "STALL_SLOT",
         "BriefDescription": "No operation sent for execution on a slot"
     },
+    {
+        "PublicDescription": "Sample Population",
+        "EventCode": "0x4000",
+        "EventName": "SAMPLE_POP",
+        "BriefDescription": "Sample Population"
+    },
+    {
+        "PublicDescription": "Sample Taken",
+        "EventCode": "0x4001",
+        "EventName": "SAMPLE_FEED",
+        "BriefDescription": "Sample Taken"
+    },
+    {
+        "PublicDescription": "Sample Taken and not removed by filtering",
+        "EventCode": "0x4002",
+        "EventName": "SAMPLE_FILTRATE",
+        "BriefDescription": "Sample Taken and not removed by filtering"
+    },
+    {
+        "PublicDescription": "Sample collided with previous sample",
+        "EventCode": "0x4003",
+        "EventName": "SAMPLE_COLLISION",
+        "BriefDescription": "Sample collided with previous sample"
+    },
     {
         "PublicDescription": "Constant frequency cycles.  The counter increments at a constant frequency equal to the rate of increment of the system counter, CNTPCT_EL0.",
         "EventCode": "0x4004",
@@ -329,6 +353,96 @@
         "EventName": "L3D_CACHE_LMISS_RD",
         "BriefDescription": "Level 3 data cache long-latency read miss"
     },
+    {
+        "PublicDescription": "Trace buffer current write pointer wrapped",
+        "EventCode": "0x400C",
+        "EventName": "TRB_WRAP",
+        "BriefDescription": "Trace buffer current write pointer wrapped"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 0",
+        "EventCode": "0x4010",
+        "EventName": "TRCEXTOUT0",
+        "BriefDescription": "PE Trace Unit external output 0"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 1",
+        "EventCode": "0x4011",
+        "EventName": "TRCEXTOUT1",
+        "BriefDescription": "PE Trace Unit external output 1"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 2",
+        "EventCode": "0x4012",
+        "EventName": "TRCEXTOUT2",
+        "BriefDescription": "PE Trace Unit external output 2"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 3",
+        "EventCode": "0x4013",
+        "EventName": "TRCEXTOUT3",
+        "BriefDescription": "PE Trace Unit external output 3"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 4",
+        "EventCode": "0x4018",
+        "EventName": "CTI_TRIGOUT4",
+        "BriefDescription": "Cross-trigger Interface output trigger 4"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 5 ",
+        "EventCode": "0x4019",
+        "EventName": "CTI_TRIGOUT5",
+        "BriefDescription": "Cross-trigger Interface output trigger 5 "
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 6",
+        "EventCode": "0x401A",
+        "EventName": "CTI_TRIGOUT6",
+        "BriefDescription": "Cross-trigger Interface output trigger 6"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 7",
+        "EventCode": "0x401B",
+        "EventName": "CTI_TRIGOUT7",
+        "BriefDescription": "Cross-trigger Interface output trigger 7"
+    },
+    {
+        "PublicDescription": "Access with additional latency from alignment",
+        "EventCode": "0x4020",
+        "EventName": "LDST_ALIGN_LAT",
+        "BriefDescription": "Access with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Load with additional latency from alignment",
+        "EventCode": "0x4021",
+        "EventName": "LD_ALIGN_LAT",
+        "BriefDescription": "Load with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Store with additional latency from alignment",
+        "EventCode": "0x4022",
+        "EventName": "ST_ALIGN_LAT",
+        "BriefDescription": "Store with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Checked data memory access",
+        "EventCode": "0x4024",
+        "EventName": "MEM_ACCESS_CHECKED",
+        "BriefDescription": "Checked data memory access"
+    },
+    {
+        "PublicDescription": "Checked data memory access, read",
+        "EventCode": "0x4025",
+        "EventName": "MEM_ACCESS_CHECKED_RD",
+        "BriefDescription": "Checked data memory access, read"
+    },
+    {
+        "PublicDescription": "Checked data memory access, write",
+        "EventCode": "0x4026",
+        "EventName": "MEM_ACCESS_CHECKED_WR",
+        "BriefDescription": "Checked data memory access, write"
+    },
     {
         "PublicDescription": "SIMD Instruction architecturally executed.",
         "EventCode": "0x8000",
@@ -341,6 +455,18 @@
         "EventName": "SVE_INST_RETIRED",
         "BriefDescription": "Instruction architecturally executed, SVE."
     },
+    {
+        "PublicDescription": "ASE operations speculatively executed",
+        "EventCode": "0x8005",
+        "EventName": "ASE_INST_SPEC",
+        "BriefDescription": "ASE operations speculatively executed"
+    },
+    {
+        "PublicDescription": "SVE operations speculatively executed",
+        "EventCode": "0x8006",
+        "EventName": "SVE_INST_SPEC",
+        "BriefDescription": "SVE operations speculatively executed"
+    },
     {
         "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
         "EventCode": "0x8008",
@@ -359,6 +485,24 @@
         "EventName": "FP_SPEC",
         "BriefDescription": "Floating-point Operations speculatively executed."
     },
+    {
+        "PublicDescription": "Floating-point half-precision operations speculatively executed",
+        "EventCode": "0x8014",
+        "EventName": "FP_HP_SPEC",
+        "BriefDescription": "Floating-point half-precision operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Floating-point single-precision operations speculatively executed",
+        "EventCode": "0x8018",
+        "EventName": "FP_SP_SPEC",
+        "BriefDescription": "Floating-point single-precision operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Floating-point double-precision operations speculatively executed",
+        "EventCode": "0x801C",
+        "EventName": "FP_DP_SPEC",
+        "BriefDescription": "Floating-point double-precision operations speculatively executed"
+    },
     {
         "PublicDescription": "Floating-point FMA Operations speculatively executed.",
         "EventCode": "0x8028",
@@ -389,6 +533,30 @@
         "EventName": "SVE_PRED_SPEC",
         "BriefDescription": "SVE predicated Operations speculatively executed."
     },
+    {
+        "PublicDescription": "SVE predicated operations with no active predicates speculatively executed",
+        "EventCode": "0x8075",
+        "EventName": "SVE_PRED_EMPTY_SPEC",
+        "BriefDescription": "SVE predicated operations with no active predicates speculatively executed"
+    },
+    {
+        "PublicDescription": "SVE predicated operations speculatively executed with all active predicates",
+        "EventCode": "0x8076",
+        "EventName": "SVE_PRED_FULL_SPEC",
+        "BriefDescription": "SVE predicated operations speculatively executed with all active predicates"
+    },
+    {
+        "PublicDescription": "SVE predicated operations speculatively executed with partially active predicates",
+        "EventCode": "0x8077",
+        "EventName": "SVE_PRED_PARTIAL_SPEC",
+        "BriefDescription": "SVE predicated operations speculatively executed with partially active predicates"
+    },
+    {
+        "PublicDescription": "SVE predicated operations with empty or partially active predicates",
+        "EventCode": "0x8079",
+        "EventName": "SVE_PRED_NOT_FULL_SPEC",
+        "BriefDescription": "SVE predicated operations with empty or partially active predicates"
+    },
     {
         "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
         "EventCode": "0x807C",
@@ -497,6 +665,12 @@
         "EventName": "SVE_LDFF_SPEC",
         "BriefDescription": "SVE First-fault load Operations speculatively executed."
     },
+    {
+        "PublicDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0",
+        "EventCode": "0x80BD",
+        "EventName": "SVE_LDFF_FAULT_SPEC",
+        "BriefDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0"
+    },
     {
         "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
         "EventCode": "0x80C0",
@@ -544,5 +718,29 @@
         "EventCode": "0x80C7",
         "EventName": "FP_DP_FIXED_OPS_SPEC",
         "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed",
+        "EventCode": "0x80E3",
+        "EventName": "ASE_SVE_INT8_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed",
+        "EventCode": "0x80E7",
+        "EventName": "ASE_SVE_INT16_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed",
+        "EventCode": "0x80EB",
+        "EventName": "ASE_SVE_INT32_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed",
+        "EventCode": "0x80EF",
+        "EventName": "ASE_SVE_INT64_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 31d8b57ca9bb..b899db48c12a 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -19,6 +19,7 @@
 0x00000000410fd0b0,v1,arm/cortex-a76-n1,core
 0x00000000410fd0c0,v1,arm/cortex-a76-n1,core
 0x00000000410fd400,v1,arm/neoverse-v1,core
+0x00000000410fd490,v1,arm/neoverse-n2,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
 0x00000000460f0010,v1,fujitsu/a64fx,core
-- 
cgit v1.2.3


From 6732f10b11c63112ce34a064b247c00e090b9514 Mon Sep 17 00:00:00 2001
From: Andrew Kilroy <andrew.kilroy@arm.com>
Date: Fri, 10 Dec 2021 12:37:05 +0000
Subject: perf vendor events: Rename arm64 arch std event files

A previous commit adds pmu events into the files

  armv8-common-and-microarch.json
  armv8-recommended.json

that are actually specified in an armv9 reference supplement, not armv8.
As such, naming the files with the armv8 prefix seems artificial.

This patch renames the files to reflect that these two files are for
arch std events regardless of whether they are defined in armv8 or
armv9.

Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Andrew Kilroy <andrew.kilroy@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211210123706.7490-3-andrew.kilroy@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../arch/arm64/armv8-common-and-microarch.json     | 746 ---------------------
 .../pmu-events/arch/arm64/armv8-recommended.json   | 452 -------------
 .../arch/arm64/common-and-microarch.json           | 746 +++++++++++++++++++++
 tools/perf/pmu-events/arch/arm64/recommended.json  | 452 +++++++++++++
 4 files changed, 1198 insertions(+), 1198 deletions(-)
 delete mode 100644 tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
 delete mode 100644 tools/perf/pmu-events/arch/arm64/armv8-recommended.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/common-and-microarch.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/recommended.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
deleted file mode 100644
index 80d7a70829a0..000000000000
--- a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
+++ /dev/null
@@ -1,746 +0,0 @@
-[
-    {
-        "PublicDescription": "Instruction architecturally executed, Condition code check pass, software increment",
-        "EventCode": "0x00",
-        "EventName": "SW_INCR",
-        "BriefDescription": "Instruction architecturally executed, Condition code check pass, software increment"
-    },
-    {
-        "PublicDescription": "Level 1 instruction cache refill",
-        "EventCode": "0x01",
-        "EventName": "L1I_CACHE_REFILL",
-        "BriefDescription": "Level 1 instruction cache refill"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 instruction TLB refill",
-        "EventCode": "0x02",
-        "EventName": "L1I_TLB_REFILL",
-        "BriefDescription": "Attributable Level 1 instruction TLB refill"
-    },
-    {
-        "PublicDescription": "Level 1 data cache refill",
-        "EventCode": "0x03",
-        "EventName": "L1D_CACHE_REFILL",
-        "BriefDescription": "Level 1 data cache refill"
-    },
-    {
-        "PublicDescription": "Level 1 data cache access",
-        "EventCode": "0x04",
-        "EventName": "L1D_CACHE",
-        "BriefDescription": "Level 1 data cache access"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data TLB refill",
-        "EventCode": "0x05",
-        "EventName": "L1D_TLB_REFILL",
-        "BriefDescription": "Attributable Level 1 data TLB refill"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed",
-        "EventCode": "0x08",
-        "EventName": "INST_RETIRED",
-        "BriefDescription": "Instruction architecturally executed"
-    },
-    {
-        "PublicDescription": "Exception taken",
-        "EventCode": "0x09",
-        "EventName": "EXC_TAKEN",
-        "BriefDescription": "Exception taken"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, condition check pass, exception return",
-        "EventCode": "0x0a",
-        "EventName": "EXC_RETURN",
-        "BriefDescription": "Instruction architecturally executed, condition check pass, exception return"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR",
-        "EventCode": "0x0b",
-        "EventName": "CID_WRITE_RETIRED",
-        "BriefDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR"
-    },
-    {
-        "PublicDescription": "Mispredicted or not predicted branch speculatively executed",
-        "EventCode": "0x10",
-        "EventName": "BR_MIS_PRED",
-        "BriefDescription": "Mispredicted or not predicted branch speculatively executed"
-    },
-    {
-        "PublicDescription": "Cycle",
-        "EventCode": "0x11",
-        "EventName": "CPU_CYCLES",
-        "BriefDescription": "Cycle"
-    },
-    {
-        "PublicDescription": "Predictable branch speculatively executed",
-        "EventCode": "0x12",
-        "EventName": "BR_PRED",
-        "BriefDescription": "Predictable branch speculatively executed"
-    },
-    {
-        "PublicDescription": "Data memory access",
-        "EventCode": "0x13",
-        "EventName": "MEM_ACCESS",
-        "BriefDescription": "Data memory access"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 instruction cache access",
-        "EventCode": "0x14",
-        "EventName": "L1I_CACHE",
-        "BriefDescription": "Attributable Level 1 instruction cache access"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache write-back",
-        "EventCode": "0x15",
-        "EventName": "L1D_CACHE_WB",
-        "BriefDescription": "Attributable Level 1 data cache write-back"
-    },
-    {
-        "PublicDescription": "Level 2 data cache access",
-        "EventCode": "0x16",
-        "EventName": "L2D_CACHE",
-        "BriefDescription": "Level 2 data cache access"
-    },
-    {
-        "PublicDescription": "Level 2 data refill",
-        "EventCode": "0x17",
-        "EventName": "L2D_CACHE_REFILL",
-        "BriefDescription": "Level 2 data refill"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache write-back",
-        "EventCode": "0x18",
-        "EventName": "L2D_CACHE_WB",
-        "BriefDescription": "Attributable Level 2 data cache write-back"
-    },
-    {
-        "PublicDescription": "Attributable Bus access",
-        "EventCode": "0x19",
-        "EventName": "BUS_ACCESS",
-        "BriefDescription": "Attributable Bus access"
-    },
-    {
-        "PublicDescription": "Local memory error",
-        "EventCode": "0x1a",
-        "EventName": "MEMORY_ERROR",
-        "BriefDescription": "Local memory error"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed",
-        "EventCode": "0x1b",
-        "EventName": "INST_SPEC",
-        "BriefDescription": "Operation speculatively executed"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, Condition code check pass, write to TTBR",
-        "EventCode": "0x1c",
-        "EventName": "TTBR_WRITE_RETIRED",
-        "BriefDescription": "Instruction architecturally executed, Condition code check pass, write to TTBR"
-    },
-    {
-        "PublicDescription": "Bus cycle",
-        "EventCode": "0x1D",
-        "EventName": "BUS_CYCLES",
-        "BriefDescription": "Bus cycle"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache allocation without refill",
-        "EventCode": "0x20",
-        "EventName": "L2D_CACHE_ALLOCATE",
-        "BriefDescription": "Attributable Level 2 data cache allocation without refill"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, branch",
-        "EventCode": "0x21",
-        "EventName": "BR_RETIRED",
-        "BriefDescription": "Instruction architecturally executed, branch"
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, mispredicted branch",
-        "EventCode": "0x22",
-        "EventName": "BR_MIS_PRED_RETIRED",
-        "BriefDescription": "Instruction architecturally executed, mispredicted branch"
-    },
-    {
-        "PublicDescription": "No operation issued because of the frontend",
-        "EventCode": "0x23",
-        "EventName": "STALL_FRONTEND",
-        "BriefDescription": "No operation issued because of the frontend"
-    },
-    {
-        "PublicDescription": "No operation issued due to the backend",
-        "EventCode": "0x24",
-        "EventName": "STALL_BACKEND",
-        "BriefDescription": "No operation issued due to the backend"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data or unified TLB access",
-        "EventCode": "0x25",
-        "EventName": "L1D_TLB",
-        "BriefDescription": "Attributable Level 1 data or unified TLB access"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 instruction TLB access",
-        "EventCode": "0x26",
-        "EventName": "L1I_TLB",
-        "BriefDescription": "Attributable Level 1 instruction TLB access"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data cache allocation without refill",
-        "EventCode": "0x29",
-        "EventName": "L3D_CACHE_ALLOCATE",
-        "BriefDescription": "Attributable Level 3 data cache allocation without refill"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data cache refill",
-        "EventCode": "0x2A",
-        "EventName": "L3D_CACHE_REFILL",
-        "BriefDescription": "Attributable Level 3 data cache refill"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data cache access",
-        "EventCode": "0x2B",
-        "EventName": "L3D_CACHE",
-        "BriefDescription": "Attributable Level 3 data cache access"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data TLB refill",
-        "EventCode": "0x2D",
-        "EventName": "L2D_TLB_REFILL",
-        "BriefDescription": "Attributable Level 2 data TLB refill"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 instruction TLB refill.",
-        "EventCode": "0x2E",
-        "EventName": "L2I_TLB_REFILL",
-        "BriefDescription": "Attributable Level 2 instruction TLB refill."
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data or unified TLB access",
-        "EventCode": "0x2F",
-        "EventName": "L2D_TLB",
-        "BriefDescription": "Attributable Level 2 data or unified TLB access"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 instruction TLB access.",
-        "EventCode": "0x30",
-        "EventName": "L2I_TLB",
-        "BriefDescription": "Attributable Level 2 instruction TLB access."
-    },
-    {
-        "PublicDescription": "Access to another socket in a multi-socket system",
-        "EventCode": "0x31",
-        "EventName": "REMOTE_ACCESS",
-        "BriefDescription": "Access to another socket in a multi-socket system"
-    },
-    {
-        "PublicDescription": "Access to data TLB causes a translation table walk",
-        "EventCode": "0x34",
-        "EventName": "DTLB_WALK",
-        "BriefDescription": "Access to data TLB causes a translation table walk"
-    },
-    {
-        "PublicDescription": "Access to instruction TLB that causes a translation table walk",
-        "EventCode": "0x35",
-        "EventName": "ITLB_WALK",
-        "BriefDescription": "Access to instruction TLB that causes a translation table walk"
-    },
-    {
-        "PublicDescription": "Attributable Last level cache memory read",
-        "EventCode": "0x36",
-        "EventName": "LL_CACHE_RD",
-        "BriefDescription": "Attributable Last level cache memory read"
-    },
-    {
-        "PublicDescription": "Last level cache miss, read",
-        "EventCode": "0x37",
-        "EventName": "LL_CACHE_MISS_RD",
-        "BriefDescription": "Last level cache miss, read"
-    },
-    {
-        "PublicDescription": "Level 1 data cache long-latency read miss.  The counter counts each memory read access counted by L1D_CACHE that incurs additional latency because it returns data from outside the Level 1 data or unified cache of this processing element.",
-        "EventCode": "0x39",
-        "EventName": "L1D_CACHE_LMISS_RD",
-        "BriefDescription": "Level 1 data cache long-latency read miss"
-    },
-    {
-        "PublicDescription": "Micro-operation architecturally executed.  The counter counts each operation counted by OP_SPEC that would be executed in a simple sequential execution of the program.",
-        "EventCode": "0x3A",
-        "EventName": "OP_RETIRED",
-        "BriefDescription": "Micro-operation architecturally executed"
-    },
-    {
-        "PublicDescription": "Micro-operation speculatively executed.  The counter counts the number of operations executed by the processing element, including those that are executed speculatively and would not be executed in a simple sequential execution of the program.",
-        "EventCode": "0x3B",
-        "EventName": "OP_SPEC",
-        "BriefDescription": "Micro-operation speculatively executed"
-    },
-    {
-        "PublicDescription": "No operation sent for execution.  The counter counts every attributable cycle on which no attributable instruction or operation was sent for execution on this processing element.",
-        "EventCode": "0x3C",
-        "EventName": "STALL",
-        "BriefDescription": "No operation sent for execution"
-    },
-    {
-        "PublicDescription": "No operation sent for execution on a slot due to the backend.  Counts each slot counted by STALL_SLOT where no attributable instruction or operation was sent for execution because the backend is unable to accept it.",
-        "EventCode": "0x3D",
-        "EventName": "STALL_SLOT_BACKEND",
-        "BriefDescription": "No operation sent for execution on a slot due to the backend"
-    },
-    {
-        "PublicDescription": "No operation sent for execution on a slot due to the frontend.  Counts each slot counted by STALL_SLOT where no attributable instruction or operation was sent for execution because there was no attributable instruction or operation available to issue from the processing element from the frontend for the slot.",
-        "EventCode": "0x3E",
-        "EventName": "STALL_SLOT_FRONTEND",
-        "BriefDescription": "No operation sent for execution on a slot due to the frontend"
-    },
-    {
-        "PublicDescription": "No operation sent for execution on a slot.  The counter counts on each attributable cycle the number of instruction or operation slots that were not occupied by an instruction or operation attributable to the processing element.",
-        "EventCode": "0x3F",
-        "EventName": "STALL_SLOT",
-        "BriefDescription": "No operation sent for execution on a slot"
-    },
-    {
-        "PublicDescription": "Sample Population",
-        "EventCode": "0x4000",
-        "EventName": "SAMPLE_POP",
-        "BriefDescription": "Sample Population"
-    },
-    {
-        "PublicDescription": "Sample Taken",
-        "EventCode": "0x4001",
-        "EventName": "SAMPLE_FEED",
-        "BriefDescription": "Sample Taken"
-    },
-    {
-        "PublicDescription": "Sample Taken and not removed by filtering",
-        "EventCode": "0x4002",
-        "EventName": "SAMPLE_FILTRATE",
-        "BriefDescription": "Sample Taken and not removed by filtering"
-    },
-    {
-        "PublicDescription": "Sample collided with previous sample",
-        "EventCode": "0x4003",
-        "EventName": "SAMPLE_COLLISION",
-        "BriefDescription": "Sample collided with previous sample"
-    },
-    {
-        "PublicDescription": "Constant frequency cycles.  The counter increments at a constant frequency equal to the rate of increment of the system counter, CNTPCT_EL0.",
-        "EventCode": "0x4004",
-        "EventName": "CNT_CYCLES",
-        "BriefDescription": "Constant frequency cycles"
-    },
-    {
-        "PublicDescription": "Memory stall cycles.  The counter counts each cycle counted by STALL_BACKEND where there is a cache miss in the last level of cache within the processing element clock domain",
-        "EventCode": "0x4005",
-        "EventName": "STALL_BACKEND_MEM",
-        "BriefDescription": "Memory stall cycles"
-    },
-    {
-        "PublicDescription": "Level 1 instruction cache long-latency read miss.  If the L1I_CACHE_RD event is implemented, the counter counts each access counted by L1I_CACHE_RD that incurs additional latency because it returns instructions from outside of the Level 1 instruction cache of this PE.  If the L1I_CACHE_RD event is not implemented, the counter counts each access counted by L1I_CACHE that incurs additional latency because it returns instructions from outside the Level 1 instruction cache of this PE.  The event indicates to software that the access missed in the Level 1 instruction cache and might have a significant performance impact due to the additional latency, compared to the latency of an access that hits in the Level 1 instruction cache.",
-        "EventCode": "0x4006",
-        "EventName": "L1I_CACHE_LMISS",
-        "BriefDescription": "Level 1 instruction cache long-latency read miss"
-    },
-    {
-        "PublicDescription": "Level 2 data cache long-latency read miss.  The counter counts each memory read access counted by L2D_CACHE that incurs additional latency because it returns data from outside the Level 2 data or unified cache of this processing element.  The event indicates to software that the access missed in the Level 2 data or unified cache and might have a significant performance impact compared to the latency of an access that hits in the Level 2 data or unified cache.",
-        "EventCode": "0x4009",
-        "EventName": "L2D_CACHE_LMISS_RD",
-        "BriefDescription": "Level 2 data cache long-latency read miss"
-    },
-    {
-        "PublicDescription": "Level 3 data cache long-latency read miss.  The counter counts each memory read access counted by L3D_CACHE that incurs additional latency because it returns data from outside the Level 3 data or unified cache of this processing element.  The event indicates to software that the access missed in the Level 3 data or unified cache and might have a significant performance impact compared to the latency of an access that hits in the Level 3 data or unified cache.",
-        "EventCode": "0x400B",
-        "EventName": "L3D_CACHE_LMISS_RD",
-        "BriefDescription": "Level 3 data cache long-latency read miss"
-    },
-    {
-        "PublicDescription": "Trace buffer current write pointer wrapped",
-        "EventCode": "0x400C",
-        "EventName": "TRB_WRAP",
-        "BriefDescription": "Trace buffer current write pointer wrapped"
-    },
-    {
-        "PublicDescription": "PE Trace Unit external output 0",
-        "EventCode": "0x4010",
-        "EventName": "TRCEXTOUT0",
-        "BriefDescription": "PE Trace Unit external output 0"
-    },
-    {
-        "PublicDescription": "PE Trace Unit external output 1",
-        "EventCode": "0x4011",
-        "EventName": "TRCEXTOUT1",
-        "BriefDescription": "PE Trace Unit external output 1"
-    },
-    {
-        "PublicDescription": "PE Trace Unit external output 2",
-        "EventCode": "0x4012",
-        "EventName": "TRCEXTOUT2",
-        "BriefDescription": "PE Trace Unit external output 2"
-    },
-    {
-        "PublicDescription": "PE Trace Unit external output 3",
-        "EventCode": "0x4013",
-        "EventName": "TRCEXTOUT3",
-        "BriefDescription": "PE Trace Unit external output 3"
-    },
-    {
-        "PublicDescription": "Cross-trigger Interface output trigger 4",
-        "EventCode": "0x4018",
-        "EventName": "CTI_TRIGOUT4",
-        "BriefDescription": "Cross-trigger Interface output trigger 4"
-    },
-    {
-        "PublicDescription": "Cross-trigger Interface output trigger 5 ",
-        "EventCode": "0x4019",
-        "EventName": "CTI_TRIGOUT5",
-        "BriefDescription": "Cross-trigger Interface output trigger 5 "
-    },
-    {
-        "PublicDescription": "Cross-trigger Interface output trigger 6",
-        "EventCode": "0x401A",
-        "EventName": "CTI_TRIGOUT6",
-        "BriefDescription": "Cross-trigger Interface output trigger 6"
-    },
-    {
-        "PublicDescription": "Cross-trigger Interface output trigger 7",
-        "EventCode": "0x401B",
-        "EventName": "CTI_TRIGOUT7",
-        "BriefDescription": "Cross-trigger Interface output trigger 7"
-    },
-    {
-        "PublicDescription": "Access with additional latency from alignment",
-        "EventCode": "0x4020",
-        "EventName": "LDST_ALIGN_LAT",
-        "BriefDescription": "Access with additional latency from alignment"
-    },
-    {
-        "PublicDescription": "Load with additional latency from alignment",
-        "EventCode": "0x4021",
-        "EventName": "LD_ALIGN_LAT",
-        "BriefDescription": "Load with additional latency from alignment"
-    },
-    {
-        "PublicDescription": "Store with additional latency from alignment",
-        "EventCode": "0x4022",
-        "EventName": "ST_ALIGN_LAT",
-        "BriefDescription": "Store with additional latency from alignment"
-    },
-    {
-        "PublicDescription": "Checked data memory access",
-        "EventCode": "0x4024",
-        "EventName": "MEM_ACCESS_CHECKED",
-        "BriefDescription": "Checked data memory access"
-    },
-    {
-        "PublicDescription": "Checked data memory access, read",
-        "EventCode": "0x4025",
-        "EventName": "MEM_ACCESS_CHECKED_RD",
-        "BriefDescription": "Checked data memory access, read"
-    },
-    {
-        "PublicDescription": "Checked data memory access, write",
-        "EventCode": "0x4026",
-        "EventName": "MEM_ACCESS_CHECKED_WR",
-        "BriefDescription": "Checked data memory access, write"
-    },
-    {
-        "PublicDescription": "SIMD Instruction architecturally executed.",
-        "EventCode": "0x8000",
-        "EventName": "SIMD_INST_RETIRED",
-        "BriefDescription": "SIMD Instruction architecturally executed."
-    },
-    {
-        "PublicDescription": "Instruction architecturally executed, SVE.",
-        "EventCode": "0x8002",
-        "EventName": "SVE_INST_RETIRED",
-        "BriefDescription": "Instruction architecturally executed, SVE."
-    },
-    {
-        "PublicDescription": "ASE operations speculatively executed",
-        "EventCode": "0x8005",
-        "EventName": "ASE_INST_SPEC",
-        "BriefDescription": "ASE operations speculatively executed"
-    },
-    {
-        "PublicDescription": "SVE operations speculatively executed",
-        "EventCode": "0x8006",
-        "EventName": "SVE_INST_SPEC",
-        "BriefDescription": "SVE operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
-        "EventCode": "0x8008",
-        "EventName": "UOP_SPEC",
-        "BriefDescription": "Microarchitectural operation, Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE Math accelerator Operations speculatively executed.",
-        "EventCode": "0x800E",
-        "EventName": "SVE_MATH_SPEC",
-        "BriefDescription": "SVE Math accelerator Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Floating-point Operations speculatively executed.",
-        "EventCode": "0x8010",
-        "EventName": "FP_SPEC",
-        "BriefDescription": "Floating-point Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Floating-point half-precision operations speculatively executed",
-        "EventCode": "0x8014",
-        "EventName": "FP_HP_SPEC",
-        "BriefDescription": "Floating-point half-precision operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Floating-point single-precision operations speculatively executed",
-        "EventCode": "0x8018",
-        "EventName": "FP_SP_SPEC",
-        "BriefDescription": "Floating-point single-precision operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Floating-point double-precision operations speculatively executed",
-        "EventCode": "0x801C",
-        "EventName": "FP_DP_SPEC",
-        "BriefDescription": "Floating-point double-precision operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Floating-point FMA Operations speculatively executed.",
-        "EventCode": "0x8028",
-        "EventName": "FP_FMA_SPEC",
-        "BriefDescription": "Floating-point FMA Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.",
-        "EventCode": "0x8034",
-        "EventName": "FP_RECPE_SPEC",
-        "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "floating-point convert Operations speculatively executed.",
-        "EventCode": "0x8038",
-        "EventName": "FP_CVT_SPEC",
-        "BriefDescription": "floating-point convert Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.",
-        "EventCode": "0x8043",
-        "EventName": "ASE_SVE_INT_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE predicated Operations speculatively executed.",
-        "EventCode": "0x8074",
-        "EventName": "SVE_PRED_SPEC",
-        "BriefDescription": "SVE predicated Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE predicated operations with no active predicates speculatively executed",
-        "EventCode": "0x8075",
-        "EventName": "SVE_PRED_EMPTY_SPEC",
-        "BriefDescription": "SVE predicated operations with no active predicates speculatively executed"
-    },
-    {
-        "PublicDescription": "SVE predicated operations speculatively executed with all active predicates",
-        "EventCode": "0x8076",
-        "EventName": "SVE_PRED_FULL_SPEC",
-        "BriefDescription": "SVE predicated operations speculatively executed with all active predicates"
-    },
-    {
-        "PublicDescription": "SVE predicated operations speculatively executed with partially active predicates",
-        "EventCode": "0x8077",
-        "EventName": "SVE_PRED_PARTIAL_SPEC",
-        "BriefDescription": "SVE predicated operations speculatively executed with partially active predicates"
-    },
-    {
-        "PublicDescription": "SVE predicated operations with empty or partially active predicates",
-        "EventCode": "0x8079",
-        "EventName": "SVE_PRED_NOT_FULL_SPEC",
-        "BriefDescription": "SVE predicated operations with empty or partially active predicates"
-    },
-    {
-        "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
-        "EventCode": "0x807C",
-        "EventName": "SVE_MOVPRFX_SPEC",
-        "BriefDescription": "SVE MOVPRFX Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.",
-        "EventCode": "0x807F",
-        "EventName": "SVE_MOVPRFX_U_SPEC",
-        "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.",
-        "EventCode": "0x8085",
-        "EventName": "ASE_SVE_LD_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.",
-        "EventCode": "0x8086",
-        "EventName": "ASE_SVE_ST_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Prefetch Operations speculatively executed.",
-        "EventCode": "0x8087",
-        "EventName": "PRF_SPEC",
-        "BriefDescription": "Prefetch Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "General-purpose register load Operations speculatively executed.",
-        "EventCode": "0x8089",
-        "EventName": "BASE_LD_REG_SPEC",
-        "BriefDescription": "General-purpose register load Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "General-purpose register store Operations speculatively executed.",
-        "EventCode": "0x808A",
-        "EventName": "BASE_ST_REG_SPEC",
-        "BriefDescription": "General-purpose register store Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE unpredicated load register Operations speculatively executed.",
-        "EventCode": "0x8091",
-        "EventName": "SVE_LDR_REG_SPEC",
-        "BriefDescription": "SVE unpredicated load register Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE unpredicated store register Operations speculatively executed.",
-        "EventCode": "0x8092",
-        "EventName": "SVE_STR_REG_SPEC",
-        "BriefDescription": "SVE unpredicated store register Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE load predicate register Operations speculatively executed.",
-        "EventCode": "0x8095",
-        "EventName": "SVE_LDR_PREG_SPEC",
-        "BriefDescription": "SVE load predicate register Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE store predicate register Operations speculatively executed.",
-        "EventCode": "0x8096",
-        "EventName": "SVE_STR_PREG_SPEC",
-        "BriefDescription": "SVE store predicate register Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.",
-        "EventCode": "0x809F",
-        "EventName": "SVE_PRF_CONTIG_SPEC",
-        "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.",
-        "EventCode": "0x80A5",
-        "EventName": "ASE_SVE_LD_MULTI_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.",
-        "EventCode": "0x80A6",
-        "EventName": "ASE_SVE_ST_MULTI_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE gather-load Operations speculatively executed.",
-        "EventCode": "0x80AD",
-        "EventName": "SVE_LD_GATHER_SPEC",
-        "BriefDescription": "SVE gather-load Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE scatter-store Operations speculatively executed.",
-        "EventCode": "0x80AE",
-        "EventName": "SVE_ST_SCATTER_SPEC",
-        "BriefDescription": "SVE scatter-store Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE gather-prefetch Operations speculatively executed.",
-        "EventCode": "0x80AF",
-        "EventName": "SVE_PRF_GATHER_SPEC",
-        "BriefDescription": "SVE gather-prefetch Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE First-fault load Operations speculatively executed.",
-        "EventCode": "0x80BC",
-        "EventName": "SVE_LDFF_SPEC",
-        "BriefDescription": "SVE First-fault load Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0",
-        "EventCode": "0x80BD",
-        "EventName": "SVE_LDFF_FAULT_SPEC",
-        "BriefDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0"
-    },
-    {
-        "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C0",
-        "EventName": "FP_SCALE_OPS_SPEC",
-        "BriefDescription": "Scalable floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C1",
-        "EventName": "FP_FIXED_OPS_SPEC",
-        "BriefDescription": "Non-scalable floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C2",
-        "EventName": "FP_HP_SCALE_OPS_SPEC",
-        "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C3",
-        "EventName": "FP_HP_FIXED_OPS_SPEC",
-        "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C4",
-        "EventName": "FP_SP_SCALE_OPS_SPEC",
-        "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C5",
-        "EventName": "FP_SP_FIXED_OPS_SPEC",
-        "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C6",
-        "EventName": "FP_DP_SCALE_OPS_SPEC",
-        "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.",
-        "EventCode": "0x80C7",
-        "EventName": "FP_DP_FIXED_OPS_SPEC",
-        "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed",
-        "EventCode": "0x80E3",
-        "EventName": "ASE_SVE_INT8_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed",
-        "EventCode": "0x80E7",
-        "EventName": "ASE_SVE_INT16_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed",
-        "EventCode": "0x80EB",
-        "EventName": "ASE_SVE_INT32_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed"
-    },
-    {
-        "PublicDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed",
-        "EventCode": "0x80EF",
-        "EventName": "ASE_SVE_INT64_SPEC",
-        "BriefDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed"
-    }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json b/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
deleted file mode 100644
index 210afa856091..000000000000
--- a/tools/perf/pmu-events/arch/arm64/armv8-recommended.json
+++ /dev/null
@@ -1,452 +0,0 @@
-[
-    {
-        "PublicDescription": "Attributable Level 1 data cache access, read",
-        "EventCode": "0x40",
-        "EventName": "L1D_CACHE_RD",
-        "BriefDescription": "L1D cache access, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache access, write",
-        "EventCode": "0x41",
-        "EventName": "L1D_CACHE_WR",
-        "BriefDescription": "L1D cache access, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache refill, read",
-        "EventCode": "0x42",
-        "EventName": "L1D_CACHE_REFILL_RD",
-        "BriefDescription": "L1D cache refill, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache refill, write",
-        "EventCode": "0x43",
-        "EventName": "L1D_CACHE_REFILL_WR",
-        "BriefDescription": "L1D cache refill, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache refill, inner",
-        "EventCode": "0x44",
-        "EventName": "L1D_CACHE_REFILL_INNER",
-        "BriefDescription": "L1D cache refill, inner"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache refill, outer",
-        "EventCode": "0x45",
-        "EventName": "L1D_CACHE_REFILL_OUTER",
-        "BriefDescription": "L1D cache refill, outer"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache Write-Back, victim",
-        "EventCode": "0x46",
-        "EventName": "L1D_CACHE_WB_VICTIM",
-        "BriefDescription": "L1D cache Write-Back, victim"
-    },
-    {
-        "PublicDescription": "Level 1 data cache Write-Back, cleaning and coherency",
-        "EventCode": "0x47",
-        "EventName": "L1D_CACHE_WB_CLEAN",
-        "BriefDescription": "L1D cache Write-Back, cleaning and coherency"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data cache invalidate",
-        "EventCode": "0x48",
-        "EventName": "L1D_CACHE_INVAL",
-        "BriefDescription": "L1D cache invalidate"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data TLB refill, read",
-        "EventCode": "0x4C",
-        "EventName": "L1D_TLB_REFILL_RD",
-        "BriefDescription": "L1D tlb refill, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data TLB refill, write",
-        "EventCode": "0x4D",
-        "EventName": "L1D_TLB_REFILL_WR",
-        "BriefDescription": "L1D tlb refill, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data or unified TLB access, read",
-        "EventCode": "0x4E",
-        "EventName": "L1D_TLB_RD",
-        "BriefDescription": "L1D tlb access, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 1 data or unified TLB access, write",
-        "EventCode": "0x4F",
-        "EventName": "L1D_TLB_WR",
-        "BriefDescription": "L1D tlb access, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache access, read",
-        "EventCode": "0x50",
-        "EventName": "L2D_CACHE_RD",
-        "BriefDescription": "L2D cache access, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache access, write",
-        "EventCode": "0x51",
-        "EventName": "L2D_CACHE_WR",
-        "BriefDescription": "L2D cache access, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache refill, read",
-        "EventCode": "0x52",
-        "EventName": "L2D_CACHE_REFILL_RD",
-        "BriefDescription": "L2D cache refill, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache refill, write",
-        "EventCode": "0x53",
-        "EventName": "L2D_CACHE_REFILL_WR",
-        "BriefDescription": "L2D cache refill, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache Write-Back, victim",
-        "EventCode": "0x56",
-        "EventName": "L2D_CACHE_WB_VICTIM",
-        "BriefDescription": "L2D cache Write-Back, victim"
-    },
-    {
-        "PublicDescription": "Level 2 data cache Write-Back, cleaning and coherency",
-        "EventCode": "0x57",
-        "EventName": "L2D_CACHE_WB_CLEAN",
-        "BriefDescription": "L2D cache Write-Back, cleaning and coherency"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data cache invalidate",
-        "EventCode": "0x58",
-        "EventName": "L2D_CACHE_INVAL",
-        "BriefDescription": "L2D cache invalidate"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data or unified TLB refill, read",
-        "EventCode": "0x5c",
-        "EventName": "L2D_TLB_REFILL_RD",
-        "BriefDescription": "L2D cache refill, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data or unified TLB refill, write",
-        "EventCode": "0x5d",
-        "EventName": "L2D_TLB_REFILL_WR",
-        "BriefDescription": "L2D cache refill, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data or unified TLB access, read",
-        "EventCode": "0x5e",
-        "EventName": "L2D_TLB_RD",
-        "BriefDescription": "L2D cache access, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 2 data or unified TLB access, write",
-        "EventCode": "0x5f",
-        "EventName": "L2D_TLB_WR",
-        "BriefDescription": "L2D cache access, write"
-    },
-    {
-        "PublicDescription": "Bus access read",
-        "EventCode": "0x60",
-        "EventName": "BUS_ACCESS_RD",
-        "BriefDescription": "Bus access read"
-    },
-    {
-        "PublicDescription": "Bus access write",
-        "EventCode": "0x61",
-        "EventName": "BUS_ACCESS_WR",
-        "BriefDescription": "Bus access write"
-    },
-    {
-        "PublicDescription": "Bus access, Normal, Cacheable, Shareable",
-        "EventCode": "0x62",
-        "EventName": "BUS_ACCESS_SHARED",
-        "BriefDescription": "Bus access, Normal, Cacheable, Shareable"
-    },
-    {
-        "PublicDescription": "Bus access, not Normal, Cacheable, Shareable",
-        "EventCode": "0x63",
-        "EventName": "BUS_ACCESS_NOT_SHARED",
-        "BriefDescription": "Bus access, not Normal, Cacheable, Shareable"
-    },
-    {
-        "PublicDescription": "Bus access, Normal",
-        "EventCode": "0x64",
-        "EventName": "BUS_ACCESS_NORMAL",
-        "BriefDescription": "Bus access, Normal"
-    },
-    {
-        "PublicDescription": "Bus access, peripheral",
-        "EventCode": "0x65",
-        "EventName": "BUS_ACCESS_PERIPH",
-        "BriefDescription": "Bus access, peripheral"
-    },
-    {
-        "PublicDescription": "Data memory access, read",
-        "EventCode": "0x66",
-        "EventName": "MEM_ACCESS_RD",
-        "BriefDescription": "Data memory access, read"
-    },
-    {
-        "PublicDescription": "Data memory access, write",
-        "EventCode": "0x67",
-        "EventName": "MEM_ACCESS_WR",
-        "BriefDescription": "Data memory access, write"
-    },
-    {
-        "PublicDescription": "Unaligned access, read",
-        "EventCode": "0x68",
-        "EventName": "UNALIGNED_LD_SPEC",
-        "BriefDescription": "Unaligned access, read"
-    },
-    {
-        "PublicDescription": "Unaligned access, write",
-        "EventCode": "0x69",
-        "EventName": "UNALIGNED_ST_SPEC",
-        "BriefDescription": "Unaligned access, write"
-    },
-    {
-        "PublicDescription": "Unaligned access",
-        "EventCode": "0x6a",
-        "EventName": "UNALIGNED_LDST_SPEC",
-        "BriefDescription": "Unaligned access"
-    },
-    {
-        "PublicDescription": "Exclusive operation speculatively executed, LDREX or LDX",
-        "EventCode": "0x6c",
-        "EventName": "LDREX_SPEC",
-        "BriefDescription": "Exclusive operation speculatively executed, LDREX or LDX"
-    },
-    {
-        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX pass",
-        "EventCode": "0x6d",
-        "EventName": "STREX_PASS_SPEC",
-        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX pass"
-    },
-    {
-        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX fail",
-        "EventCode": "0x6e",
-        "EventName": "STREX_FAIL_SPEC",
-        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX fail"
-    },
-    {
-        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX",
-        "EventCode": "0x6f",
-        "EventName": "STREX_SPEC",
-        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, load",
-        "EventCode": "0x70",
-        "EventName": "LD_SPEC",
-        "BriefDescription": "Operation speculatively executed, load"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, store",
-        "EventCode": "0x71",
-        "EventName": "ST_SPEC",
-        "BriefDescription": "Operation speculatively executed, store"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, load or store",
-        "EventCode": "0x72",
-        "EventName": "LDST_SPEC",
-        "BriefDescription": "Operation speculatively executed, load or store"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, integer data processing",
-        "EventCode": "0x73",
-        "EventName": "DP_SPEC",
-        "BriefDescription": "Operation speculatively executed, integer data processing"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, Advanced SIMD instruction",
-        "EventCode": "0x74",
-        "EventName": "ASE_SPEC",
-        "BriefDescription": "Operation speculatively executed, Advanced SIMD instruction"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, floating-point instruction",
-        "EventCode": "0x75",
-        "EventName": "VFP_SPEC",
-        "BriefDescription": "Operation speculatively executed, floating-point instruction"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, software change of the PC",
-        "EventCode": "0x76",
-        "EventName": "PC_WRITE_SPEC",
-        "BriefDescription": "Operation speculatively executed, software change of the PC"
-    },
-    {
-        "PublicDescription": "Operation speculatively executed, Cryptographic instruction",
-        "EventCode": "0x77",
-        "EventName": "CRYPTO_SPEC",
-        "BriefDescription": "Operation speculatively executed, Cryptographic instruction"
-    },
-    {
-        "PublicDescription": "Branch speculatively executed, immediate branch",
-        "EventCode": "0x78",
-        "EventName": "BR_IMMED_SPEC",
-        "BriefDescription": "Branch speculatively executed, immediate branch"
-    },
-    {
-        "PublicDescription": "Branch speculatively executed, procedure return",
-        "EventCode": "0x79",
-        "EventName": "BR_RETURN_SPEC",
-        "BriefDescription": "Branch speculatively executed, procedure return"
-    },
-    {
-        "PublicDescription": "Branch speculatively executed, indirect branch",
-        "EventCode": "0x7a",
-        "EventName": "BR_INDIRECT_SPEC",
-        "BriefDescription": "Branch speculatively executed, indirect branch"
-    },
-    {
-        "PublicDescription": "Barrier speculatively executed, ISB",
-        "EventCode": "0x7c",
-        "EventName": "ISB_SPEC",
-        "BriefDescription": "Barrier speculatively executed, ISB"
-    },
-    {
-        "PublicDescription": "Barrier speculatively executed, DSB",
-        "EventCode": "0x7d",
-        "EventName": "DSB_SPEC",
-        "BriefDescription": "Barrier speculatively executed, DSB"
-    },
-    {
-        "PublicDescription": "Barrier speculatively executed, DMB",
-        "EventCode": "0x7e",
-        "EventName": "DMB_SPEC",
-        "BriefDescription": "Barrier speculatively executed, DMB"
-    },
-    {
-        "PublicDescription": "Exception taken, Other synchronous",
-        "EventCode": "0x81",
-        "EventName": "EXC_UNDEF",
-        "BriefDescription": "Exception taken, Other synchronous"
-    },
-    {
-        "PublicDescription": "Exception taken, Supervisor Call",
-        "EventCode": "0x82",
-        "EventName": "EXC_SVC",
-        "BriefDescription": "Exception taken, Supervisor Call"
-    },
-    {
-        "PublicDescription": "Exception taken, Instruction Abort",
-        "EventCode": "0x83",
-        "EventName": "EXC_PABORT",
-        "BriefDescription": "Exception taken, Instruction Abort"
-    },
-    {
-        "PublicDescription": "Exception taken, Data Abort and SError",
-        "EventCode": "0x84",
-        "EventName": "EXC_DABORT",
-        "BriefDescription": "Exception taken, Data Abort and SError"
-    },
-    {
-        "PublicDescription": "Exception taken, IRQ",
-        "EventCode": "0x86",
-        "EventName": "EXC_IRQ",
-        "BriefDescription": "Exception taken, IRQ"
-    },
-    {
-        "PublicDescription": "Exception taken, FIQ",
-        "EventCode": "0x87",
-        "EventName": "EXC_FIQ",
-        "BriefDescription": "Exception taken, FIQ"
-    },
-    {
-        "PublicDescription": "Exception taken, Secure Monitor Call",
-        "EventCode": "0x88",
-        "EventName": "EXC_SMC",
-        "BriefDescription": "Exception taken, Secure Monitor Call"
-    },
-    {
-        "PublicDescription": "Exception taken, Hypervisor Call",
-        "EventCode": "0x8a",
-        "EventName": "EXC_HVC",
-        "BriefDescription": "Exception taken, Hypervisor Call"
-    },
-    {
-        "PublicDescription": "Exception taken, Instruction Abort not taken locally",
-        "EventCode": "0x8b",
-        "EventName": "EXC_TRAP_PABORT",
-        "BriefDescription": "Exception taken, Instruction Abort not taken locally"
-    },
-    {
-        "PublicDescription": "Exception taken, Data Abort or SError not taken locally",
-        "EventCode": "0x8c",
-        "EventName": "EXC_TRAP_DABORT",
-        "BriefDescription": "Exception taken, Data Abort or SError not taken locally"
-    },
-    {
-        "PublicDescription": "Exception taken, Other traps not taken locally",
-        "EventCode": "0x8d",
-        "EventName": "EXC_TRAP_OTHER",
-        "BriefDescription": "Exception taken, Other traps not taken locally"
-    },
-    {
-        "PublicDescription": "Exception taken, IRQ not taken locally",
-        "EventCode": "0x8e",
-        "EventName": "EXC_TRAP_IRQ",
-        "BriefDescription": "Exception taken, IRQ not taken locally"
-    },
-    {
-        "PublicDescription": "Exception taken, FIQ not taken locally",
-        "EventCode": "0x8f",
-        "EventName": "EXC_TRAP_FIQ",
-        "BriefDescription": "Exception taken, FIQ not taken locally"
-    },
-    {
-        "PublicDescription": "Release consistency operation speculatively executed, Load-Acquire",
-        "EventCode": "0x90",
-        "EventName": "RC_LD_SPEC",
-        "BriefDescription": "Release consistency operation speculatively executed, Load-Acquire"
-    },
-    {
-        "PublicDescription": "Release consistency operation speculatively executed, Store-Release",
-        "EventCode": "0x91",
-        "EventName": "RC_ST_SPEC",
-        "BriefDescription": "Release consistency operation speculatively executed, Store-Release"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache access, read",
-        "EventCode": "0xa0",
-        "EventName": "L3D_CACHE_RD",
-        "BriefDescription": "Attributable Level 3 data or unified cache access, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache access, write",
-        "EventCode": "0xa1",
-        "EventName": "L3D_CACHE_WR",
-        "BriefDescription": "Attributable Level 3 data or unified cache access, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache refill, read",
-        "EventCode": "0xa2",
-        "EventName": "L3D_CACHE_REFILL_RD",
-        "BriefDescription": "Attributable Level 3 data or unified cache refill, read"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache refill, write",
-        "EventCode": "0xa3",
-        "EventName": "L3D_CACHE_REFILL_WR",
-        "BriefDescription": "Attributable Level 3 data or unified cache refill, write"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, victim",
-        "EventCode": "0xa6",
-        "EventName": "L3D_CACHE_WB_VICTIM",
-        "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, victim"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean",
-        "EventCode": "0xa7",
-        "EventName": "L3D_CACHE_WB_CLEAN",
-        "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean"
-    },
-    {
-        "PublicDescription": "Attributable Level 3 data or unified cache access, invalidate",
-        "EventCode": "0xa8",
-        "EventName": "L3D_CACHE_INVAL",
-        "BriefDescription": "Attributable Level 3 data or unified cache access, invalidate"
-    }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
new file mode 100644
index 000000000000..80d7a70829a0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
@@ -0,0 +1,746 @@
+[
+    {
+        "PublicDescription": "Instruction architecturally executed, Condition code check pass, software increment",
+        "EventCode": "0x00",
+        "EventName": "SW_INCR",
+        "BriefDescription": "Instruction architecturally executed, Condition code check pass, software increment"
+    },
+    {
+        "PublicDescription": "Level 1 instruction cache refill",
+        "EventCode": "0x01",
+        "EventName": "L1I_CACHE_REFILL",
+        "BriefDescription": "Level 1 instruction cache refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 instruction TLB refill",
+        "EventCode": "0x02",
+        "EventName": "L1I_TLB_REFILL",
+        "BriefDescription": "Attributable Level 1 instruction TLB refill"
+    },
+    {
+        "PublicDescription": "Level 1 data cache refill",
+        "EventCode": "0x03",
+        "EventName": "L1D_CACHE_REFILL",
+        "BriefDescription": "Level 1 data cache refill"
+    },
+    {
+        "PublicDescription": "Level 1 data cache access",
+        "EventCode": "0x04",
+        "EventName": "L1D_CACHE",
+        "BriefDescription": "Level 1 data cache access"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data TLB refill",
+        "EventCode": "0x05",
+        "EventName": "L1D_TLB_REFILL",
+        "BriefDescription": "Attributable Level 1 data TLB refill"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed",
+        "EventCode": "0x08",
+        "EventName": "INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed"
+    },
+    {
+        "PublicDescription": "Exception taken",
+        "EventCode": "0x09",
+        "EventName": "EXC_TAKEN",
+        "BriefDescription": "Exception taken"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, condition check pass, exception return",
+        "EventCode": "0x0a",
+        "EventName": "EXC_RETURN",
+        "BriefDescription": "Instruction architecturally executed, condition check pass, exception return"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR",
+        "EventCode": "0x0b",
+        "EventName": "CID_WRITE_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR"
+    },
+    {
+        "PublicDescription": "Mispredicted or not predicted branch speculatively executed",
+        "EventCode": "0x10",
+        "EventName": "BR_MIS_PRED",
+        "BriefDescription": "Mispredicted or not predicted branch speculatively executed"
+    },
+    {
+        "PublicDescription": "Cycle",
+        "EventCode": "0x11",
+        "EventName": "CPU_CYCLES",
+        "BriefDescription": "Cycle"
+    },
+    {
+        "PublicDescription": "Predictable branch speculatively executed",
+        "EventCode": "0x12",
+        "EventName": "BR_PRED",
+        "BriefDescription": "Predictable branch speculatively executed"
+    },
+    {
+        "PublicDescription": "Data memory access",
+        "EventCode": "0x13",
+        "EventName": "MEM_ACCESS",
+        "BriefDescription": "Data memory access"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 instruction cache access",
+        "EventCode": "0x14",
+        "EventName": "L1I_CACHE",
+        "BriefDescription": "Attributable Level 1 instruction cache access"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache write-back",
+        "EventCode": "0x15",
+        "EventName": "L1D_CACHE_WB",
+        "BriefDescription": "Attributable Level 1 data cache write-back"
+    },
+    {
+        "PublicDescription": "Level 2 data cache access",
+        "EventCode": "0x16",
+        "EventName": "L2D_CACHE",
+        "BriefDescription": "Level 2 data cache access"
+    },
+    {
+        "PublicDescription": "Level 2 data refill",
+        "EventCode": "0x17",
+        "EventName": "L2D_CACHE_REFILL",
+        "BriefDescription": "Level 2 data refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache write-back",
+        "EventCode": "0x18",
+        "EventName": "L2D_CACHE_WB",
+        "BriefDescription": "Attributable Level 2 data cache write-back"
+    },
+    {
+        "PublicDescription": "Attributable Bus access",
+        "EventCode": "0x19",
+        "EventName": "BUS_ACCESS",
+        "BriefDescription": "Attributable Bus access"
+    },
+    {
+        "PublicDescription": "Local memory error",
+        "EventCode": "0x1a",
+        "EventName": "MEMORY_ERROR",
+        "BriefDescription": "Local memory error"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed",
+        "EventCode": "0x1b",
+        "EventName": "INST_SPEC",
+        "BriefDescription": "Operation speculatively executed"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, Condition code check pass, write to TTBR",
+        "EventCode": "0x1c",
+        "EventName": "TTBR_WRITE_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, Condition code check pass, write to TTBR"
+    },
+    {
+        "PublicDescription": "Bus cycle",
+        "EventCode": "0x1D",
+        "EventName": "BUS_CYCLES",
+        "BriefDescription": "Bus cycle"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache allocation without refill",
+        "EventCode": "0x20",
+        "EventName": "L2D_CACHE_ALLOCATE",
+        "BriefDescription": "Attributable Level 2 data cache allocation without refill"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, branch",
+        "EventCode": "0x21",
+        "EventName": "BR_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, branch"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, mispredicted branch",
+        "EventCode": "0x22",
+        "EventName": "BR_MIS_PRED_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, mispredicted branch"
+    },
+    {
+        "PublicDescription": "No operation issued because of the frontend",
+        "EventCode": "0x23",
+        "EventName": "STALL_FRONTEND",
+        "BriefDescription": "No operation issued because of the frontend"
+    },
+    {
+        "PublicDescription": "No operation issued due to the backend",
+        "EventCode": "0x24",
+        "EventName": "STALL_BACKEND",
+        "BriefDescription": "No operation issued due to the backend"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data or unified TLB access",
+        "EventCode": "0x25",
+        "EventName": "L1D_TLB",
+        "BriefDescription": "Attributable Level 1 data or unified TLB access"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 instruction TLB access",
+        "EventCode": "0x26",
+        "EventName": "L1I_TLB",
+        "BriefDescription": "Attributable Level 1 instruction TLB access"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data cache allocation without refill",
+        "EventCode": "0x29",
+        "EventName": "L3D_CACHE_ALLOCATE",
+        "BriefDescription": "Attributable Level 3 data cache allocation without refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data cache refill",
+        "EventCode": "0x2A",
+        "EventName": "L3D_CACHE_REFILL",
+        "BriefDescription": "Attributable Level 3 data cache refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data cache access",
+        "EventCode": "0x2B",
+        "EventName": "L3D_CACHE",
+        "BriefDescription": "Attributable Level 3 data cache access"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data TLB refill",
+        "EventCode": "0x2D",
+        "EventName": "L2D_TLB_REFILL",
+        "BriefDescription": "Attributable Level 2 data TLB refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 instruction TLB refill.",
+        "EventCode": "0x2E",
+        "EventName": "L2I_TLB_REFILL",
+        "BriefDescription": "Attributable Level 2 instruction TLB refill."
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data or unified TLB access",
+        "EventCode": "0x2F",
+        "EventName": "L2D_TLB",
+        "BriefDescription": "Attributable Level 2 data or unified TLB access"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 instruction TLB access.",
+        "EventCode": "0x30",
+        "EventName": "L2I_TLB",
+        "BriefDescription": "Attributable Level 2 instruction TLB access."
+    },
+    {
+        "PublicDescription": "Access to another socket in a multi-socket system",
+        "EventCode": "0x31",
+        "EventName": "REMOTE_ACCESS",
+        "BriefDescription": "Access to another socket in a multi-socket system"
+    },
+    {
+        "PublicDescription": "Access to data TLB causes a translation table walk",
+        "EventCode": "0x34",
+        "EventName": "DTLB_WALK",
+        "BriefDescription": "Access to data TLB causes a translation table walk"
+    },
+    {
+        "PublicDescription": "Access to instruction TLB that causes a translation table walk",
+        "EventCode": "0x35",
+        "EventName": "ITLB_WALK",
+        "BriefDescription": "Access to instruction TLB that causes a translation table walk"
+    },
+    {
+        "PublicDescription": "Attributable Last level cache memory read",
+        "EventCode": "0x36",
+        "EventName": "LL_CACHE_RD",
+        "BriefDescription": "Attributable Last level cache memory read"
+    },
+    {
+        "PublicDescription": "Last level cache miss, read",
+        "EventCode": "0x37",
+        "EventName": "LL_CACHE_MISS_RD",
+        "BriefDescription": "Last level cache miss, read"
+    },
+    {
+        "PublicDescription": "Level 1 data cache long-latency read miss.  The counter counts each memory read access counted by L1D_CACHE that incurs additional latency because it returns data from outside the Level 1 data or unified cache of this processing element.",
+        "EventCode": "0x39",
+        "EventName": "L1D_CACHE_LMISS_RD",
+        "BriefDescription": "Level 1 data cache long-latency read miss"
+    },
+    {
+        "PublicDescription": "Micro-operation architecturally executed.  The counter counts each operation counted by OP_SPEC that would be executed in a simple sequential execution of the program.",
+        "EventCode": "0x3A",
+        "EventName": "OP_RETIRED",
+        "BriefDescription": "Micro-operation architecturally executed"
+    },
+    {
+        "PublicDescription": "Micro-operation speculatively executed.  The counter counts the number of operations executed by the processing element, including those that are executed speculatively and would not be executed in a simple sequential execution of the program.",
+        "EventCode": "0x3B",
+        "EventName": "OP_SPEC",
+        "BriefDescription": "Micro-operation speculatively executed"
+    },
+    {
+        "PublicDescription": "No operation sent for execution.  The counter counts every attributable cycle on which no attributable instruction or operation was sent for execution on this processing element.",
+        "EventCode": "0x3C",
+        "EventName": "STALL",
+        "BriefDescription": "No operation sent for execution"
+    },
+    {
+        "PublicDescription": "No operation sent for execution on a slot due to the backend.  Counts each slot counted by STALL_SLOT where no attributable instruction or operation was sent for execution because the backend is unable to accept it.",
+        "EventCode": "0x3D",
+        "EventName": "STALL_SLOT_BACKEND",
+        "BriefDescription": "No operation sent for execution on a slot due to the backend"
+    },
+    {
+        "PublicDescription": "No operation sent for execution on a slot due to the frontend.  Counts each slot counted by STALL_SLOT where no attributable instruction or operation was sent for execution because there was no attributable instruction or operation available to issue from the processing element from the frontend for the slot.",
+        "EventCode": "0x3E",
+        "EventName": "STALL_SLOT_FRONTEND",
+        "BriefDescription": "No operation sent for execution on a slot due to the frontend"
+    },
+    {
+        "PublicDescription": "No operation sent for execution on a slot.  The counter counts on each attributable cycle the number of instruction or operation slots that were not occupied by an instruction or operation attributable to the processing element.",
+        "EventCode": "0x3F",
+        "EventName": "STALL_SLOT",
+        "BriefDescription": "No operation sent for execution on a slot"
+    },
+    {
+        "PublicDescription": "Sample Population",
+        "EventCode": "0x4000",
+        "EventName": "SAMPLE_POP",
+        "BriefDescription": "Sample Population"
+    },
+    {
+        "PublicDescription": "Sample Taken",
+        "EventCode": "0x4001",
+        "EventName": "SAMPLE_FEED",
+        "BriefDescription": "Sample Taken"
+    },
+    {
+        "PublicDescription": "Sample Taken and not removed by filtering",
+        "EventCode": "0x4002",
+        "EventName": "SAMPLE_FILTRATE",
+        "BriefDescription": "Sample Taken and not removed by filtering"
+    },
+    {
+        "PublicDescription": "Sample collided with previous sample",
+        "EventCode": "0x4003",
+        "EventName": "SAMPLE_COLLISION",
+        "BriefDescription": "Sample collided with previous sample"
+    },
+    {
+        "PublicDescription": "Constant frequency cycles.  The counter increments at a constant frequency equal to the rate of increment of the system counter, CNTPCT_EL0.",
+        "EventCode": "0x4004",
+        "EventName": "CNT_CYCLES",
+        "BriefDescription": "Constant frequency cycles"
+    },
+    {
+        "PublicDescription": "Memory stall cycles.  The counter counts each cycle counted by STALL_BACKEND where there is a cache miss in the last level of cache within the processing element clock domain",
+        "EventCode": "0x4005",
+        "EventName": "STALL_BACKEND_MEM",
+        "BriefDescription": "Memory stall cycles"
+    },
+    {
+        "PublicDescription": "Level 1 instruction cache long-latency read miss.  If the L1I_CACHE_RD event is implemented, the counter counts each access counted by L1I_CACHE_RD that incurs additional latency because it returns instructions from outside of the Level 1 instruction cache of this PE.  If the L1I_CACHE_RD event is not implemented, the counter counts each access counted by L1I_CACHE that incurs additional latency because it returns instructions from outside the Level 1 instruction cache of this PE.  The event indicates to software that the access missed in the Level 1 instruction cache and might have a significant performance impact due to the additional latency, compared to the latency of an access that hits in the Level 1 instruction cache.",
+        "EventCode": "0x4006",
+        "EventName": "L1I_CACHE_LMISS",
+        "BriefDescription": "Level 1 instruction cache long-latency read miss"
+    },
+    {
+        "PublicDescription": "Level 2 data cache long-latency read miss.  The counter counts each memory read access counted by L2D_CACHE that incurs additional latency because it returns data from outside the Level 2 data or unified cache of this processing element.  The event indicates to software that the access missed in the Level 2 data or unified cache and might have a significant performance impact compared to the latency of an access that hits in the Level 2 data or unified cache.",
+        "EventCode": "0x4009",
+        "EventName": "L2D_CACHE_LMISS_RD",
+        "BriefDescription": "Level 2 data cache long-latency read miss"
+    },
+    {
+        "PublicDescription": "Level 3 data cache long-latency read miss.  The counter counts each memory read access counted by L3D_CACHE that incurs additional latency because it returns data from outside the Level 3 data or unified cache of this processing element.  The event indicates to software that the access missed in the Level 3 data or unified cache and might have a significant performance impact compared to the latency of an access that hits in the Level 3 data or unified cache.",
+        "EventCode": "0x400B",
+        "EventName": "L3D_CACHE_LMISS_RD",
+        "BriefDescription": "Level 3 data cache long-latency read miss"
+    },
+    {
+        "PublicDescription": "Trace buffer current write pointer wrapped",
+        "EventCode": "0x400C",
+        "EventName": "TRB_WRAP",
+        "BriefDescription": "Trace buffer current write pointer wrapped"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 0",
+        "EventCode": "0x4010",
+        "EventName": "TRCEXTOUT0",
+        "BriefDescription": "PE Trace Unit external output 0"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 1",
+        "EventCode": "0x4011",
+        "EventName": "TRCEXTOUT1",
+        "BriefDescription": "PE Trace Unit external output 1"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 2",
+        "EventCode": "0x4012",
+        "EventName": "TRCEXTOUT2",
+        "BriefDescription": "PE Trace Unit external output 2"
+    },
+    {
+        "PublicDescription": "PE Trace Unit external output 3",
+        "EventCode": "0x4013",
+        "EventName": "TRCEXTOUT3",
+        "BriefDescription": "PE Trace Unit external output 3"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 4",
+        "EventCode": "0x4018",
+        "EventName": "CTI_TRIGOUT4",
+        "BriefDescription": "Cross-trigger Interface output trigger 4"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 5 ",
+        "EventCode": "0x4019",
+        "EventName": "CTI_TRIGOUT5",
+        "BriefDescription": "Cross-trigger Interface output trigger 5 "
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 6",
+        "EventCode": "0x401A",
+        "EventName": "CTI_TRIGOUT6",
+        "BriefDescription": "Cross-trigger Interface output trigger 6"
+    },
+    {
+        "PublicDescription": "Cross-trigger Interface output trigger 7",
+        "EventCode": "0x401B",
+        "EventName": "CTI_TRIGOUT7",
+        "BriefDescription": "Cross-trigger Interface output trigger 7"
+    },
+    {
+        "PublicDescription": "Access with additional latency from alignment",
+        "EventCode": "0x4020",
+        "EventName": "LDST_ALIGN_LAT",
+        "BriefDescription": "Access with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Load with additional latency from alignment",
+        "EventCode": "0x4021",
+        "EventName": "LD_ALIGN_LAT",
+        "BriefDescription": "Load with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Store with additional latency from alignment",
+        "EventCode": "0x4022",
+        "EventName": "ST_ALIGN_LAT",
+        "BriefDescription": "Store with additional latency from alignment"
+    },
+    {
+        "PublicDescription": "Checked data memory access",
+        "EventCode": "0x4024",
+        "EventName": "MEM_ACCESS_CHECKED",
+        "BriefDescription": "Checked data memory access"
+    },
+    {
+        "PublicDescription": "Checked data memory access, read",
+        "EventCode": "0x4025",
+        "EventName": "MEM_ACCESS_CHECKED_RD",
+        "BriefDescription": "Checked data memory access, read"
+    },
+    {
+        "PublicDescription": "Checked data memory access, write",
+        "EventCode": "0x4026",
+        "EventName": "MEM_ACCESS_CHECKED_WR",
+        "BriefDescription": "Checked data memory access, write"
+    },
+    {
+        "PublicDescription": "SIMD Instruction architecturally executed.",
+        "EventCode": "0x8000",
+        "EventName": "SIMD_INST_RETIRED",
+        "BriefDescription": "SIMD Instruction architecturally executed."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, SVE.",
+        "EventCode": "0x8002",
+        "EventName": "SVE_INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, SVE."
+    },
+    {
+        "PublicDescription": "ASE operations speculatively executed",
+        "EventCode": "0x8005",
+        "EventName": "ASE_INST_SPEC",
+        "BriefDescription": "ASE operations speculatively executed"
+    },
+    {
+        "PublicDescription": "SVE operations speculatively executed",
+        "EventCode": "0x8006",
+        "EventName": "SVE_INST_SPEC",
+        "BriefDescription": "SVE operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
+        "EventCode": "0x8008",
+        "EventName": "UOP_SPEC",
+        "BriefDescription": "Microarchitectural operation, Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE Math accelerator Operations speculatively executed.",
+        "EventCode": "0x800E",
+        "EventName": "SVE_MATH_SPEC",
+        "BriefDescription": "SVE Math accelerator Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point Operations speculatively executed.",
+        "EventCode": "0x8010",
+        "EventName": "FP_SPEC",
+        "BriefDescription": "Floating-point Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point half-precision operations speculatively executed",
+        "EventCode": "0x8014",
+        "EventName": "FP_HP_SPEC",
+        "BriefDescription": "Floating-point half-precision operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Floating-point single-precision operations speculatively executed",
+        "EventCode": "0x8018",
+        "EventName": "FP_SP_SPEC",
+        "BriefDescription": "Floating-point single-precision operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Floating-point double-precision operations speculatively executed",
+        "EventCode": "0x801C",
+        "EventName": "FP_DP_SPEC",
+        "BriefDescription": "Floating-point double-precision operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Floating-point FMA Operations speculatively executed.",
+        "EventCode": "0x8028",
+        "EventName": "FP_FMA_SPEC",
+        "BriefDescription": "Floating-point FMA Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.",
+        "EventCode": "0x8034",
+        "EventName": "FP_RECPE_SPEC",
+        "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "floating-point convert Operations speculatively executed.",
+        "EventCode": "0x8038",
+        "EventName": "FP_CVT_SPEC",
+        "BriefDescription": "floating-point convert Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.",
+        "EventCode": "0x8043",
+        "EventName": "ASE_SVE_INT_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE predicated Operations speculatively executed.",
+        "EventCode": "0x8074",
+        "EventName": "SVE_PRED_SPEC",
+        "BriefDescription": "SVE predicated Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE predicated operations with no active predicates speculatively executed",
+        "EventCode": "0x8075",
+        "EventName": "SVE_PRED_EMPTY_SPEC",
+        "BriefDescription": "SVE predicated operations with no active predicates speculatively executed"
+    },
+    {
+        "PublicDescription": "SVE predicated operations speculatively executed with all active predicates",
+        "EventCode": "0x8076",
+        "EventName": "SVE_PRED_FULL_SPEC",
+        "BriefDescription": "SVE predicated operations speculatively executed with all active predicates"
+    },
+    {
+        "PublicDescription": "SVE predicated operations speculatively executed with partially active predicates",
+        "EventCode": "0x8077",
+        "EventName": "SVE_PRED_PARTIAL_SPEC",
+        "BriefDescription": "SVE predicated operations speculatively executed with partially active predicates"
+    },
+    {
+        "PublicDescription": "SVE predicated operations with empty or partially active predicates",
+        "EventCode": "0x8079",
+        "EventName": "SVE_PRED_NOT_FULL_SPEC",
+        "BriefDescription": "SVE predicated operations with empty or partially active predicates"
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
+        "EventCode": "0x807C",
+        "EventName": "SVE_MOVPRFX_SPEC",
+        "BriefDescription": "SVE MOVPRFX Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.",
+        "EventCode": "0x807F",
+        "EventName": "SVE_MOVPRFX_U_SPEC",
+        "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.",
+        "EventCode": "0x8085",
+        "EventName": "ASE_SVE_LD_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.",
+        "EventCode": "0x8086",
+        "EventName": "ASE_SVE_ST_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Prefetch Operations speculatively executed.",
+        "EventCode": "0x8087",
+        "EventName": "PRF_SPEC",
+        "BriefDescription": "Prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register load Operations speculatively executed.",
+        "EventCode": "0x8089",
+        "EventName": "BASE_LD_REG_SPEC",
+        "BriefDescription": "General-purpose register load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register store Operations speculatively executed.",
+        "EventCode": "0x808A",
+        "EventName": "BASE_ST_REG_SPEC",
+        "BriefDescription": "General-purpose register store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated load register Operations speculatively executed.",
+        "EventCode": "0x8091",
+        "EventName": "SVE_LDR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated load register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated store register Operations speculatively executed.",
+        "EventCode": "0x8092",
+        "EventName": "SVE_STR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated store register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE load predicate register Operations speculatively executed.",
+        "EventCode": "0x8095",
+        "EventName": "SVE_LDR_PREG_SPEC",
+        "BriefDescription": "SVE load predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE store predicate register Operations speculatively executed.",
+        "EventCode": "0x8096",
+        "EventName": "SVE_STR_PREG_SPEC",
+        "BriefDescription": "SVE store predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.",
+        "EventCode": "0x809F",
+        "EventName": "SVE_PRF_CONTIG_SPEC",
+        "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A5",
+        "EventName": "ASE_SVE_LD_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A6",
+        "EventName": "ASE_SVE_ST_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-load Operations speculatively executed.",
+        "EventCode": "0x80AD",
+        "EventName": "SVE_LD_GATHER_SPEC",
+        "BriefDescription": "SVE gather-load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE scatter-store Operations speculatively executed.",
+        "EventCode": "0x80AE",
+        "EventName": "SVE_ST_SCATTER_SPEC",
+        "BriefDescription": "SVE scatter-store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-prefetch Operations speculatively executed.",
+        "EventCode": "0x80AF",
+        "EventName": "SVE_PRF_GATHER_SPEC",
+        "BriefDescription": "SVE gather-prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE First-fault load Operations speculatively executed.",
+        "EventCode": "0x80BC",
+        "EventName": "SVE_LDFF_SPEC",
+        "BriefDescription": "SVE First-fault load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0",
+        "EventCode": "0x80BD",
+        "EventName": "SVE_LDFF_FAULT_SPEC",
+        "BriefDescription": "SVE first-fault load operations speculatively executed which set FFR bit to 0"
+    },
+    {
+        "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C0",
+        "EventName": "FP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C1",
+        "EventName": "FP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C2",
+        "EventName": "FP_HP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C3",
+        "EventName": "FP_HP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C4",
+        "EventName": "FP_SP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C5",
+        "EventName": "FP_SP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C6",
+        "EventName": "FP_DP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C7",
+        "EventName": "FP_DP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed",
+        "EventCode": "0x80E3",
+        "EventName": "ASE_SVE_INT8_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 8-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed",
+        "EventCode": "0x80E7",
+        "EventName": "ASE_SVE_INT16_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 16-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed",
+        "EventCode": "0x80EB",
+        "EventName": "ASE_SVE_INT32_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 32-bit integer operations speculatively executed"
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed",
+        "EventCode": "0x80EF",
+        "EventName": "ASE_SVE_INT64_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE 64-bit integer operations speculatively executed"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/recommended.json b/tools/perf/pmu-events/arch/arm64/recommended.json
new file mode 100644
index 000000000000..210afa856091
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/recommended.json
@@ -0,0 +1,452 @@
+[
+    {
+        "PublicDescription": "Attributable Level 1 data cache access, read",
+        "EventCode": "0x40",
+        "EventName": "L1D_CACHE_RD",
+        "BriefDescription": "L1D cache access, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache access, write",
+        "EventCode": "0x41",
+        "EventName": "L1D_CACHE_WR",
+        "BriefDescription": "L1D cache access, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, read",
+        "EventCode": "0x42",
+        "EventName": "L1D_CACHE_REFILL_RD",
+        "BriefDescription": "L1D cache refill, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, write",
+        "EventCode": "0x43",
+        "EventName": "L1D_CACHE_REFILL_WR",
+        "BriefDescription": "L1D cache refill, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, inner",
+        "EventCode": "0x44",
+        "EventName": "L1D_CACHE_REFILL_INNER",
+        "BriefDescription": "L1D cache refill, inner"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache refill, outer",
+        "EventCode": "0x45",
+        "EventName": "L1D_CACHE_REFILL_OUTER",
+        "BriefDescription": "L1D cache refill, outer"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache Write-Back, victim",
+        "EventCode": "0x46",
+        "EventName": "L1D_CACHE_WB_VICTIM",
+        "BriefDescription": "L1D cache Write-Back, victim"
+    },
+    {
+        "PublicDescription": "Level 1 data cache Write-Back, cleaning and coherency",
+        "EventCode": "0x47",
+        "EventName": "L1D_CACHE_WB_CLEAN",
+        "BriefDescription": "L1D cache Write-Back, cleaning and coherency"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data cache invalidate",
+        "EventCode": "0x48",
+        "EventName": "L1D_CACHE_INVAL",
+        "BriefDescription": "L1D cache invalidate"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data TLB refill, read",
+        "EventCode": "0x4C",
+        "EventName": "L1D_TLB_REFILL_RD",
+        "BriefDescription": "L1D tlb refill, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data TLB refill, write",
+        "EventCode": "0x4D",
+        "EventName": "L1D_TLB_REFILL_WR",
+        "BriefDescription": "L1D tlb refill, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data or unified TLB access, read",
+        "EventCode": "0x4E",
+        "EventName": "L1D_TLB_RD",
+        "BriefDescription": "L1D tlb access, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 1 data or unified TLB access, write",
+        "EventCode": "0x4F",
+        "EventName": "L1D_TLB_WR",
+        "BriefDescription": "L1D tlb access, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache access, read",
+        "EventCode": "0x50",
+        "EventName": "L2D_CACHE_RD",
+        "BriefDescription": "L2D cache access, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache access, write",
+        "EventCode": "0x51",
+        "EventName": "L2D_CACHE_WR",
+        "BriefDescription": "L2D cache access, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache refill, read",
+        "EventCode": "0x52",
+        "EventName": "L2D_CACHE_REFILL_RD",
+        "BriefDescription": "L2D cache refill, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache refill, write",
+        "EventCode": "0x53",
+        "EventName": "L2D_CACHE_REFILL_WR",
+        "BriefDescription": "L2D cache refill, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache Write-Back, victim",
+        "EventCode": "0x56",
+        "EventName": "L2D_CACHE_WB_VICTIM",
+        "BriefDescription": "L2D cache Write-Back, victim"
+    },
+    {
+        "PublicDescription": "Level 2 data cache Write-Back, cleaning and coherency",
+        "EventCode": "0x57",
+        "EventName": "L2D_CACHE_WB_CLEAN",
+        "BriefDescription": "L2D cache Write-Back, cleaning and coherency"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data cache invalidate",
+        "EventCode": "0x58",
+        "EventName": "L2D_CACHE_INVAL",
+        "BriefDescription": "L2D cache invalidate"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data or unified TLB refill, read",
+        "EventCode": "0x5c",
+        "EventName": "L2D_TLB_REFILL_RD",
+        "BriefDescription": "L2D cache refill, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data or unified TLB refill, write",
+        "EventCode": "0x5d",
+        "EventName": "L2D_TLB_REFILL_WR",
+        "BriefDescription": "L2D cache refill, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data or unified TLB access, read",
+        "EventCode": "0x5e",
+        "EventName": "L2D_TLB_RD",
+        "BriefDescription": "L2D cache access, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 2 data or unified TLB access, write",
+        "EventCode": "0x5f",
+        "EventName": "L2D_TLB_WR",
+        "BriefDescription": "L2D cache access, write"
+    },
+    {
+        "PublicDescription": "Bus access read",
+        "EventCode": "0x60",
+        "EventName": "BUS_ACCESS_RD",
+        "BriefDescription": "Bus access read"
+    },
+    {
+        "PublicDescription": "Bus access write",
+        "EventCode": "0x61",
+        "EventName": "BUS_ACCESS_WR",
+        "BriefDescription": "Bus access write"
+    },
+    {
+        "PublicDescription": "Bus access, Normal, Cacheable, Shareable",
+        "EventCode": "0x62",
+        "EventName": "BUS_ACCESS_SHARED",
+        "BriefDescription": "Bus access, Normal, Cacheable, Shareable"
+    },
+    {
+        "PublicDescription": "Bus access, not Normal, Cacheable, Shareable",
+        "EventCode": "0x63",
+        "EventName": "BUS_ACCESS_NOT_SHARED",
+        "BriefDescription": "Bus access, not Normal, Cacheable, Shareable"
+    },
+    {
+        "PublicDescription": "Bus access, Normal",
+        "EventCode": "0x64",
+        "EventName": "BUS_ACCESS_NORMAL",
+        "BriefDescription": "Bus access, Normal"
+    },
+    {
+        "PublicDescription": "Bus access, peripheral",
+        "EventCode": "0x65",
+        "EventName": "BUS_ACCESS_PERIPH",
+        "BriefDescription": "Bus access, peripheral"
+    },
+    {
+        "PublicDescription": "Data memory access, read",
+        "EventCode": "0x66",
+        "EventName": "MEM_ACCESS_RD",
+        "BriefDescription": "Data memory access, read"
+    },
+    {
+        "PublicDescription": "Data memory access, write",
+        "EventCode": "0x67",
+        "EventName": "MEM_ACCESS_WR",
+        "BriefDescription": "Data memory access, write"
+    },
+    {
+        "PublicDescription": "Unaligned access, read",
+        "EventCode": "0x68",
+        "EventName": "UNALIGNED_LD_SPEC",
+        "BriefDescription": "Unaligned access, read"
+    },
+    {
+        "PublicDescription": "Unaligned access, write",
+        "EventCode": "0x69",
+        "EventName": "UNALIGNED_ST_SPEC",
+        "BriefDescription": "Unaligned access, write"
+    },
+    {
+        "PublicDescription": "Unaligned access",
+        "EventCode": "0x6a",
+        "EventName": "UNALIGNED_LDST_SPEC",
+        "BriefDescription": "Unaligned access"
+    },
+    {
+        "PublicDescription": "Exclusive operation speculatively executed, LDREX or LDX",
+        "EventCode": "0x6c",
+        "EventName": "LDREX_SPEC",
+        "BriefDescription": "Exclusive operation speculatively executed, LDREX or LDX"
+    },
+    {
+        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX pass",
+        "EventCode": "0x6d",
+        "EventName": "STREX_PASS_SPEC",
+        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX pass"
+    },
+    {
+        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX fail",
+        "EventCode": "0x6e",
+        "EventName": "STREX_FAIL_SPEC",
+        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX fail"
+    },
+    {
+        "PublicDescription": "Exclusive operation speculatively executed, STREX or STX",
+        "EventCode": "0x6f",
+        "EventName": "STREX_SPEC",
+        "BriefDescription": "Exclusive operation speculatively executed, STREX or STX"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, load",
+        "EventCode": "0x70",
+        "EventName": "LD_SPEC",
+        "BriefDescription": "Operation speculatively executed, load"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, store",
+        "EventCode": "0x71",
+        "EventName": "ST_SPEC",
+        "BriefDescription": "Operation speculatively executed, store"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, load or store",
+        "EventCode": "0x72",
+        "EventName": "LDST_SPEC",
+        "BriefDescription": "Operation speculatively executed, load or store"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, integer data processing",
+        "EventCode": "0x73",
+        "EventName": "DP_SPEC",
+        "BriefDescription": "Operation speculatively executed, integer data processing"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, Advanced SIMD instruction",
+        "EventCode": "0x74",
+        "EventName": "ASE_SPEC",
+        "BriefDescription": "Operation speculatively executed, Advanced SIMD instruction"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, floating-point instruction",
+        "EventCode": "0x75",
+        "EventName": "VFP_SPEC",
+        "BriefDescription": "Operation speculatively executed, floating-point instruction"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, software change of the PC",
+        "EventCode": "0x76",
+        "EventName": "PC_WRITE_SPEC",
+        "BriefDescription": "Operation speculatively executed, software change of the PC"
+    },
+    {
+        "PublicDescription": "Operation speculatively executed, Cryptographic instruction",
+        "EventCode": "0x77",
+        "EventName": "CRYPTO_SPEC",
+        "BriefDescription": "Operation speculatively executed, Cryptographic instruction"
+    },
+    {
+        "PublicDescription": "Branch speculatively executed, immediate branch",
+        "EventCode": "0x78",
+        "EventName": "BR_IMMED_SPEC",
+        "BriefDescription": "Branch speculatively executed, immediate branch"
+    },
+    {
+        "PublicDescription": "Branch speculatively executed, procedure return",
+        "EventCode": "0x79",
+        "EventName": "BR_RETURN_SPEC",
+        "BriefDescription": "Branch speculatively executed, procedure return"
+    },
+    {
+        "PublicDescription": "Branch speculatively executed, indirect branch",
+        "EventCode": "0x7a",
+        "EventName": "BR_INDIRECT_SPEC",
+        "BriefDescription": "Branch speculatively executed, indirect branch"
+    },
+    {
+        "PublicDescription": "Barrier speculatively executed, ISB",
+        "EventCode": "0x7c",
+        "EventName": "ISB_SPEC",
+        "BriefDescription": "Barrier speculatively executed, ISB"
+    },
+    {
+        "PublicDescription": "Barrier speculatively executed, DSB",
+        "EventCode": "0x7d",
+        "EventName": "DSB_SPEC",
+        "BriefDescription": "Barrier speculatively executed, DSB"
+    },
+    {
+        "PublicDescription": "Barrier speculatively executed, DMB",
+        "EventCode": "0x7e",
+        "EventName": "DMB_SPEC",
+        "BriefDescription": "Barrier speculatively executed, DMB"
+    },
+    {
+        "PublicDescription": "Exception taken, Other synchronous",
+        "EventCode": "0x81",
+        "EventName": "EXC_UNDEF",
+        "BriefDescription": "Exception taken, Other synchronous"
+    },
+    {
+        "PublicDescription": "Exception taken, Supervisor Call",
+        "EventCode": "0x82",
+        "EventName": "EXC_SVC",
+        "BriefDescription": "Exception taken, Supervisor Call"
+    },
+    {
+        "PublicDescription": "Exception taken, Instruction Abort",
+        "EventCode": "0x83",
+        "EventName": "EXC_PABORT",
+        "BriefDescription": "Exception taken, Instruction Abort"
+    },
+    {
+        "PublicDescription": "Exception taken, Data Abort and SError",
+        "EventCode": "0x84",
+        "EventName": "EXC_DABORT",
+        "BriefDescription": "Exception taken, Data Abort and SError"
+    },
+    {
+        "PublicDescription": "Exception taken, IRQ",
+        "EventCode": "0x86",
+        "EventName": "EXC_IRQ",
+        "BriefDescription": "Exception taken, IRQ"
+    },
+    {
+        "PublicDescription": "Exception taken, FIQ",
+        "EventCode": "0x87",
+        "EventName": "EXC_FIQ",
+        "BriefDescription": "Exception taken, FIQ"
+    },
+    {
+        "PublicDescription": "Exception taken, Secure Monitor Call",
+        "EventCode": "0x88",
+        "EventName": "EXC_SMC",
+        "BriefDescription": "Exception taken, Secure Monitor Call"
+    },
+    {
+        "PublicDescription": "Exception taken, Hypervisor Call",
+        "EventCode": "0x8a",
+        "EventName": "EXC_HVC",
+        "BriefDescription": "Exception taken, Hypervisor Call"
+    },
+    {
+        "PublicDescription": "Exception taken, Instruction Abort not taken locally",
+        "EventCode": "0x8b",
+        "EventName": "EXC_TRAP_PABORT",
+        "BriefDescription": "Exception taken, Instruction Abort not taken locally"
+    },
+    {
+        "PublicDescription": "Exception taken, Data Abort or SError not taken locally",
+        "EventCode": "0x8c",
+        "EventName": "EXC_TRAP_DABORT",
+        "BriefDescription": "Exception taken, Data Abort or SError not taken locally"
+    },
+    {
+        "PublicDescription": "Exception taken, Other traps not taken locally",
+        "EventCode": "0x8d",
+        "EventName": "EXC_TRAP_OTHER",
+        "BriefDescription": "Exception taken, Other traps not taken locally"
+    },
+    {
+        "PublicDescription": "Exception taken, IRQ not taken locally",
+        "EventCode": "0x8e",
+        "EventName": "EXC_TRAP_IRQ",
+        "BriefDescription": "Exception taken, IRQ not taken locally"
+    },
+    {
+        "PublicDescription": "Exception taken, FIQ not taken locally",
+        "EventCode": "0x8f",
+        "EventName": "EXC_TRAP_FIQ",
+        "BriefDescription": "Exception taken, FIQ not taken locally"
+    },
+    {
+        "PublicDescription": "Release consistency operation speculatively executed, Load-Acquire",
+        "EventCode": "0x90",
+        "EventName": "RC_LD_SPEC",
+        "BriefDescription": "Release consistency operation speculatively executed, Load-Acquire"
+    },
+    {
+        "PublicDescription": "Release consistency operation speculatively executed, Store-Release",
+        "EventCode": "0x91",
+        "EventName": "RC_ST_SPEC",
+        "BriefDescription": "Release consistency operation speculatively executed, Store-Release"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache access, read",
+        "EventCode": "0xa0",
+        "EventName": "L3D_CACHE_RD",
+        "BriefDescription": "Attributable Level 3 data or unified cache access, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache access, write",
+        "EventCode": "0xa1",
+        "EventName": "L3D_CACHE_WR",
+        "BriefDescription": "Attributable Level 3 data or unified cache access, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache refill, read",
+        "EventCode": "0xa2",
+        "EventName": "L3D_CACHE_REFILL_RD",
+        "BriefDescription": "Attributable Level 3 data or unified cache refill, read"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache refill, write",
+        "EventCode": "0xa3",
+        "EventName": "L3D_CACHE_REFILL_WR",
+        "BriefDescription": "Attributable Level 3 data or unified cache refill, write"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, victim",
+        "EventCode": "0xa6",
+        "EventName": "L3D_CACHE_WB_VICTIM",
+        "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, victim"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean",
+        "EventCode": "0xa7",
+        "EventName": "L3D_CACHE_WB_CLEAN",
+        "BriefDescription": "Attributable Level 3 data or unified cache Write-Back, cache clean"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 data or unified cache access, invalidate",
+        "EventCode": "0xa8",
+        "EventName": "L3D_CACHE_INVAL",
+        "BriefDescription": "Attributable Level 3 data or unified cache access, invalidate"
+    }
+]
-- 
cgit v1.2.3


From 7cc9680c4be7c1da7a3570711f01273b781b936b Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Wed, 8 Dec 2021 11:54:35 +0000
Subject: perf cs-etm: Remove duplicate and incorrect aux size checks

There are two checks, one is for size when running without admin, but
this one is covered by the driver and reported on in more detail here
(builtin-record.c):

  pr_err("Permission error mapping pages.\n"
         "Consider increasing "
         "/proc/sys/kernel/perf_event_mlock_kb,\n"
         "or try again with a smaller value of -m/--mmap_pages.\n"
         "(current value: %u,%u)\n",

This had the effect of artificially limiting the aux buffer size to a
value smaller than what was allowed because perf_event_mlock_kb wasn't
taken into account.

The second is to check for a power of two, but this is covered here
(evlist.c):

  pr_info("rounding mmap pages size to %s (%lu pages)\n",
          buf, pages);

Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: James Clark <james.clark@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211208115435.610101-1-james.clark@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 293a23bf8be3..8a3d54a86c9c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -407,25 +407,6 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 
 	}
 
-	/* Validate auxtrace_mmap_pages provided by user */
-	if (opts->auxtrace_mmap_pages) {
-		unsigned int max_page = (KiB(128) / page_size);
-		size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size;
-
-		if (!privileged &&
-		    opts->auxtrace_mmap_pages > max_page) {
-			opts->auxtrace_mmap_pages = max_page;
-			pr_err("auxtrace too big, truncating to %d\n",
-			       max_page);
-		}
-
-		if (!is_power_of_2(sz)) {
-			pr_err("Invalid mmap size for %s: must be a power of 2\n",
-			       CORESIGHT_ETM_PMU_NAME);
-			return -EINVAL;
-		}
-	}
-
 	if (opts->auxtrace_snapshot_mode)
 		pr_debug2("%s snapshot size: %zu\n", CORESIGHT_ETM_PMU_NAME,
 			  opts->auxtrace_snapshot_size);
-- 
cgit v1.2.3


From 8acf3793eae4d809658b1ebeed68d818d6d38142 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Sun, 12 Dec 2021 13:56:09 +0000
Subject: perf bpf-loader: Use IS_ERR_OR_NULL() to clean code and fix check

Use IS_ERR_OR_NULL() to make the code cleaner.
Also if the priv is NULL, it's improper to call PTR_ERR(priv).

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: unlisted-recipients
Link: http://lore.kernel.org/lkml/20211212135613.20000-1-linmq006@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf-loader.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index fbb3c4057c30..22662fc85cc9 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -421,7 +421,7 @@ preproc_gen_prologue(struct bpf_program *prog, int n,
 	size_t prologue_cnt = 0;
 	int i, err;
 
-	if (IS_ERR(priv) || !priv || priv->is_tp)
+	if (IS_ERR_OR_NULL(priv) || priv->is_tp)
 		goto errout;
 
 	pev = &priv->pev;
@@ -570,7 +570,7 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 	bool need_prologue = false;
 	int err, i;
 
-	if (IS_ERR(priv) || !priv) {
+	if (IS_ERR_OR_NULL(priv)) {
 		pr_debug("Internal error when hook preprocessor\n");
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
@@ -642,8 +642,11 @@ int bpf__probe(struct bpf_object *obj)
 			goto out;
 
 		priv = bpf_program__priv(prog);
-		if (IS_ERR(priv) || !priv) {
-			err = PTR_ERR(priv);
+		if (IS_ERR_OR_NULL(priv)) {
+			if (!priv)
+				err = -BPF_LOADER_ERRNO__INTERNAL;
+			else
+				err = PTR_ERR(priv);
 			goto out;
 		}
 
@@ -693,7 +696,7 @@ int bpf__unprobe(struct bpf_object *obj)
 		struct bpf_prog_priv *priv = bpf_program__priv(prog);
 		int i;
 
-		if (IS_ERR(priv) || !priv || priv->is_tp)
+		if (IS_ERR_OR_NULL(priv) || priv->is_tp)
 			continue;
 
 		for (i = 0; i < priv->pev.ntevs; i++) {
@@ -751,7 +754,7 @@ int bpf__foreach_event(struct bpf_object *obj,
 		struct perf_probe_event *pev;
 		int i, fd;
 
-		if (IS_ERR(priv) || !priv) {
+		if (IS_ERR_OR_NULL(priv)) {
 			pr_debug("bpf: failed to get private field\n");
 			return -BPF_LOADER_ERRNO__INTERNAL;
 		}
-- 
cgit v1.2.3


From 017f7d1fac1c40eba6d97490a75bc1999914ef75 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Sun, 12 Dec 2021 22:21:22 +0000
Subject: libperf tests: Fix a spelling mistake "Runnnig" -> "Running"

There is a spelling mistake in a __T_VERBOSE message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-janitors@vger.kernel.org
Link: http://lore.kernel.org/lkml/20211212222122.478537-1-colin.i.king@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/tests/test-evlist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c
index 520a78267743..e7afff12c35a 100644
--- a/tools/lib/perf/tests/test-evlist.c
+++ b/tools/lib/perf/tests/test-evlist.c
@@ -535,7 +535,7 @@ static int test_stat_multiplexing(void)
 				    (double)counts[i].run / (double)counts[i].ena * 100.0,
 				    counts[i].run, counts[i].ena);
 		} else if (scaled == -1) {
-			__T_VERBOSE("\t Not Runnnig\n");
+			__T_VERBOSE("\t Not Running\n");
 		} else {
 			__T_VERBOSE("\t Not Scaling\n");
 		}
-- 
cgit v1.2.3


From 5d28a17c1c0e9dbd299015f91800aff3466ebedf Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Sun, 12 Dec 2021 21:47:20 +0800
Subject: perf namespaces: Add helper nsinfo__is_in_root_namespace()

Refactors code for gathering PID infos, it creates the function
nsinfo__get_nspid() to parse process 'status' node in folder '/proc'.

Base on the refactoring, this patch introduces a new helper
nsinfo__is_in_root_namespace(), it returns true when the caller runs in
the root PID namespace.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: John Garry <john.garry@huawei.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonatan Goldschmidt <yonatan.goldschmidt@granulate.io>
Cc: Yonghong Song <yhs@fb.com>
Cc: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org
Link: http://lore.kernel.org/lkml/20211212134721.1721245-2-leo.yan@linaro.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/namespaces.c | 76 +++++++++++++++++++++++++++-----------------
 tools/perf/util/namespaces.h |  2 ++
 2 files changed, 48 insertions(+), 30 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 608b20c72a5c..48aa3217300b 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -60,17 +60,49 @@ void namespaces__free(struct namespaces *namespaces)
 	free(namespaces);
 }
 
+static int nsinfo__get_nspid(struct nsinfo *nsi, const char *path)
+{
+	FILE *f = NULL;
+	char *statln = NULL;
+	size_t linesz = 0;
+	char *nspid;
+
+	f = fopen(path, "r");
+	if (f == NULL)
+		return -1;
+
+	while (getline(&statln, &linesz, f) != -1) {
+		/* Use tgid if CONFIG_PID_NS is not defined. */
+		if (strstr(statln, "Tgid:") != NULL) {
+			nsi->tgid = (pid_t)strtol(strrchr(statln, '\t'),
+						     NULL, 10);
+			nsi->nstgid = nsi->tgid;
+		}
+
+		if (strstr(statln, "NStgid:") != NULL) {
+			nspid = strrchr(statln, '\t');
+			nsi->nstgid = (pid_t)strtol(nspid, NULL, 10);
+			/*
+			 * If innermost tgid is not the first, process is in a different
+			 * PID namespace.
+			 */
+			nsi->in_pidns = (statln + sizeof("NStgid:") - 1) != nspid;
+			break;
+		}
+	}
+
+	fclose(f);
+	free(statln);
+	return 0;
+}
+
 int nsinfo__init(struct nsinfo *nsi)
 {
 	char oldns[PATH_MAX];
 	char spath[PATH_MAX];
 	char *newns = NULL;
-	char *statln = NULL;
-	char *nspid;
 	struct stat old_stat;
 	struct stat new_stat;
-	FILE *f = NULL;
-	size_t linesz = 0;
 	int rv = -1;
 
 	if (snprintf(oldns, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX)
@@ -100,34 +132,9 @@ int nsinfo__init(struct nsinfo *nsi)
 	if (snprintf(spath, PATH_MAX, "/proc/%d/status", nsi->pid) >= PATH_MAX)
 		goto out;
 
-	f = fopen(spath, "r");
-	if (f == NULL)
-		goto out;
-
-	while (getline(&statln, &linesz, f) != -1) {
-		/* Use tgid if CONFIG_PID_NS is not defined. */
-		if (strstr(statln, "Tgid:") != NULL) {
-			nsi->tgid = (pid_t)strtol(strrchr(statln, '\t'),
-						     NULL, 10);
-			nsi->nstgid = nsi->tgid;
-		}
-
-		if (strstr(statln, "NStgid:") != NULL) {
-			nspid = strrchr(statln, '\t');
-			nsi->nstgid = (pid_t)strtol(nspid, NULL, 10);
-			/* If innermost tgid is not the first, process is in a different
-			 * PID namespace.
-			 */
-			nsi->in_pidns = (statln + sizeof("NStgid:") - 1) != nspid;
-			break;
-		}
-	}
-	rv = 0;
+	rv = nsinfo__get_nspid(nsi, spath);
 
 out:
-	if (f != NULL)
-		(void) fclose(f);
-	free(statln);
 	free(newns);
 	return rv;
 }
@@ -299,3 +306,12 @@ int nsinfo__stat(const char *filename, struct stat *st, struct nsinfo *nsi)
 
 	return ret;
 }
+
+bool nsinfo__is_in_root_namespace(void)
+{
+	struct nsinfo nsi;
+
+	memset(&nsi, 0x0, sizeof(nsi));
+	nsinfo__get_nspid(&nsi, "/proc/self/status");
+	return !nsi.in_pidns;
+}
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
index ad9775db7b9c..9ceea9643507 100644
--- a/tools/perf/util/namespaces.h
+++ b/tools/perf/util/namespaces.h
@@ -59,6 +59,8 @@ void nsinfo__mountns_exit(struct nscookie *nc);
 char *nsinfo__realpath(const char *path, struct nsinfo *nsi);
 int nsinfo__stat(const char *filename, struct stat *st, struct nsinfo *nsi);
 
+bool nsinfo__is_in_root_namespace(void);
+
 static inline void __nsinfo__zput(struct nsinfo **nsip)
 {
 	if (nsip) {
-- 
cgit v1.2.3


From d3b58af9a8276c24a2aa80a059d87d99f5216d3b Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Tue, 7 Dec 2021 18:06:51 +0000
Subject: perf arm64: Rename perf_event_arm_regs for ARM64 registers

The registers for ARM and ARM64 are enumerated using two enums that have
the same name. In order to be able to import both headers, the name of
one can be replaced using the C preprocessor like so:

  #define perf_event_arm_regs perf_event_arm64_regs
  #include <asm/perf_regs.h>
  #undef perf_event_arm_regs

This patch updates all imports of ARM64's perf_regs.h in order to
prevent the naming collision.

Reviewed-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Signed-off-by: German Gomez <german.gomez@arm.com>
Tested-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Link: https://lore.kernel.org/r/20211207180653.1147374-3-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/include/perf_regs.h | 2 ++
 tools/perf/util/libunwind/arm64.c         | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index fa3e07459f76..1f0d78b9f23b 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -4,7 +4,9 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
+#define perf_event_arm_regs perf_event_arm64_regs
 #include <asm/perf_regs.h>
+#undef perf_event_arm_regs
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c
index c397be0c2e32..15f60fd09424 100644
--- a/tools/perf/util/libunwind/arm64.c
+++ b/tools/perf/util/libunwind/arm64.c
@@ -23,7 +23,9 @@
 
 #include "unwind.h"
 #include "libunwind-aarch64.h"
+#define perf_event_arm_regs perf_event_arm64_regs
 #include <../../../../arch/arm64/include/uapi/asm/perf_regs.h>
+#undef perf_event_arm_regs
 #include "../../arch/arm64/util/unwind-libunwind.c"
 
 /* NO_LIBUNWIND_DEBUG_FRAME is a feature flag for local libunwind,
-- 
cgit v1.2.3


From 83869019c74cc2d01c96a3be2463a4eebe362224 Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Tue, 7 Dec 2021 18:06:52 +0000
Subject: perf arch: Support register names from all archs

When reading a perf.data file with register values, there is a mismatch
between the names and the values of the registers because the tool is
built using only the register names from the local architecture.

Reading a perf.data file that was recorded on ARM64, gives the following
erroneous output on an X86 machine:

  # perf report -i perf_arm64.data -D
  [...]
  24661932634451 0x698 [0x21d0]: PERF_RECORD_SAMPLE(IP, 0x1): 43239/43239: 0xffffc5be8f100f98 period: 1 addr: 0
  ... user regs: mask 0x1ffffffff ABI 64-bit
  .... AX    0x0000ffffd1515817
  .... BX    0x0000ffffd1515480
  .... CX    0x0000aaaadabf6c80
  .... DX    0x000000000000002e
  .... SI    0x0000000040100401
  .... DI    0x0040600200000080
  .... BP    0x0000ffffd1510e10
  .... SP    0x0000000000000000
  .... IP    0x00000000000000dd
  .... FLAGS 0x0000ffffd1510cd0
  .... CS    0x0000000000000000
  .... SS    0x0000000000000030
  .... DS    0x0000ffffa569a208
  .... ES    0x0000000000000000
  .... FS    0x0000000000000000
  .... GS    0x0000000000000000
  .... R8    0x0000aaaad3de9650
  .... R9    0x0000ffffa57397f0
  .... R10   0x0000000000000001
  .... R11   0x0000ffffa57fd000
  .... R12   0x0000ffffd1515817
  .... R13   0x0000ffffd1515480
  .... R14   0x0000aaaadabf6c80
  .... R15   0x0000000000000000
  .... unknown 0x0000000000000001
  .... unknown 0x0000000000000000
  .... unknown 0x0000000000000000
  .... unknown 0x0000000000000000
  .... unknown 0x0000000000000000
  .... unknown 0x0000ffffd1510d90
  .... unknown 0x0000ffffa5739b90
  .... unknown 0x0000ffffd1510d80
  .... XMM0  0x0000ffffa57392c8
   ... thread: perf-exec:43239
   ...... dso: [kernel.kallsyms]

As can be seen, the register names correspond to X86 registers, even
though the perf.data file was recorded on an ARM64 system. After this
patch, the output of the command displays the correct register names:

  # perf report -i perf_arm64.data -D
  [...]
  24661932634451 0x698 [0x21d0]: PERF_RECORD_SAMPLE(IP, 0x1): 43239/43239: 0xffffc5be8f100f98 period: 1 addr: 0
  ... user regs: mask 0x1ffffffff ABI 64-bit
  .... x0    0x0000ffffd1515817
  .... x1    0x0000ffffd1515480
  .... x2    0x0000aaaadabf6c80
  .... x3    0x000000000000002e
  .... x4    0x0000000040100401
  .... x5    0x0040600200000080
  .... x6    0x0000ffffd1510e10
  .... x7    0x0000000000000000
  .... x8    0x00000000000000dd
  .... x9    0x0000ffffd1510cd0
  .... x10   0x0000000000000000
  .... x11   0x0000000000000030
  .... x12   0x0000ffffa569a208
  .... x13   0x0000000000000000
  .... x14   0x0000000000000000
  .... x15   0x0000000000000000
  .... x16   0x0000aaaad3de9650
  .... x17   0x0000ffffa57397f0
  .... x18   0x0000000000000001
  .... x19   0x0000ffffa57fd000
  .... x20   0x0000ffffd1515817
  .... x21   0x0000ffffd1515480
  .... x22   0x0000aaaadabf6c80
  .... x23   0x0000000000000000
  .... x24   0x0000000000000001
  .... x25   0x0000000000000000
  .... x26   0x0000000000000000
  .... x27   0x0000000000000000
  .... x28   0x0000000000000000
  .... x29   0x0000ffffd1510d90
  .... lr    0x0000ffffa5739b90
  .... sp    0x0000ffffd1510d80
  .... pc    0x0000ffffa57392c8
   ... thread: perf-exec:43239
   ...... dso: [kernel.kallsyms]

Tester comments:

Athira reports:

"Looks good to me. Tested this patchset in powerpc by capturing regs in
powerpc and doing perf report to read the data from x86."

Reported-by: Alexandre Truong <alexandre.truong@arm.com>
Reviewed-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Signed-off-by: German Gomez <german.gomez@arm.com>
Tested-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Link: https://lore.kernel.org/r/20211207180653.1147374-4-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/include/perf_regs.h            |  42 --
 tools/perf/arch/arm64/include/perf_regs.h          |  76 ---
 tools/perf/arch/csky/include/perf_regs.h           |  82 ---
 tools/perf/arch/mips/include/perf_regs.h           |  69 ---
 tools/perf/arch/powerpc/include/perf_regs.h        |  66 --
 tools/perf/arch/riscv/include/perf_regs.h          |  74 ---
 tools/perf/arch/s390/include/perf_regs.h           |  78 ---
 tools/perf/arch/x86/include/perf_regs.h            |  82 ---
 tools/perf/builtin-script.c                        |  18 +-
 tools/perf/util/perf_regs.c                        | 666 +++++++++++++++++++++
 tools/perf/util/perf_regs.h                        |  10 +-
 .../util/scripting-engines/trace-event-python.c    |  10 +-
 tools/perf/util/session.c                          |  25 +-
 13 files changed, 697 insertions(+), 601 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
index 4085419283d0..99a06550e25d 100644
--- a/tools/perf/arch/arm/include/perf_regs.h
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -15,46 +15,4 @@ void perf_regs_load(u64 *regs);
 #define PERF_REG_IP	PERF_REG_ARM_PC
 #define PERF_REG_SP	PERF_REG_ARM_SP
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_ARM_R0:
-		return "r0";
-	case PERF_REG_ARM_R1:
-		return "r1";
-	case PERF_REG_ARM_R2:
-		return "r2";
-	case PERF_REG_ARM_R3:
-		return "r3";
-	case PERF_REG_ARM_R4:
-		return "r4";
-	case PERF_REG_ARM_R5:
-		return "r5";
-	case PERF_REG_ARM_R6:
-		return "r6";
-	case PERF_REG_ARM_R7:
-		return "r7";
-	case PERF_REG_ARM_R8:
-		return "r8";
-	case PERF_REG_ARM_R9:
-		return "r9";
-	case PERF_REG_ARM_R10:
-		return "r10";
-	case PERF_REG_ARM_FP:
-		return "fp";
-	case PERF_REG_ARM_IP:
-		return "ip";
-	case PERF_REG_ARM_SP:
-		return "sp";
-	case PERF_REG_ARM_LR:
-		return "lr";
-	case PERF_REG_ARM_PC:
-		return "pc";
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index 1f0d78b9f23b..35a3cc775b39 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -17,80 +17,4 @@ void perf_regs_load(u64 *regs);
 #define PERF_REG_IP	PERF_REG_ARM64_PC
 #define PERF_REG_SP	PERF_REG_ARM64_SP
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_ARM64_X0:
-		return "x0";
-	case PERF_REG_ARM64_X1:
-		return "x1";
-	case PERF_REG_ARM64_X2:
-		return "x2";
-	case PERF_REG_ARM64_X3:
-		return "x3";
-	case PERF_REG_ARM64_X4:
-		return "x4";
-	case PERF_REG_ARM64_X5:
-		return "x5";
-	case PERF_REG_ARM64_X6:
-		return "x6";
-	case PERF_REG_ARM64_X7:
-		return "x7";
-	case PERF_REG_ARM64_X8:
-		return "x8";
-	case PERF_REG_ARM64_X9:
-		return "x9";
-	case PERF_REG_ARM64_X10:
-		return "x10";
-	case PERF_REG_ARM64_X11:
-		return "x11";
-	case PERF_REG_ARM64_X12:
-		return "x12";
-	case PERF_REG_ARM64_X13:
-		return "x13";
-	case PERF_REG_ARM64_X14:
-		return "x14";
-	case PERF_REG_ARM64_X15:
-		return "x15";
-	case PERF_REG_ARM64_X16:
-		return "x16";
-	case PERF_REG_ARM64_X17:
-		return "x17";
-	case PERF_REG_ARM64_X18:
-		return "x18";
-	case PERF_REG_ARM64_X19:
-		return "x19";
-	case PERF_REG_ARM64_X20:
-		return "x20";
-	case PERF_REG_ARM64_X21:
-		return "x21";
-	case PERF_REG_ARM64_X22:
-		return "x22";
-	case PERF_REG_ARM64_X23:
-		return "x23";
-	case PERF_REG_ARM64_X24:
-		return "x24";
-	case PERF_REG_ARM64_X25:
-		return "x25";
-	case PERF_REG_ARM64_X26:
-		return "x26";
-	case PERF_REG_ARM64_X27:
-		return "x27";
-	case PERF_REG_ARM64_X28:
-		return "x28";
-	case PERF_REG_ARM64_X29:
-		return "x29";
-	case PERF_REG_ARM64_SP:
-		return "sp";
-	case PERF_REG_ARM64_LR:
-		return "lr";
-	case PERF_REG_ARM64_PC:
-		return "pc";
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/csky/include/perf_regs.h b/tools/perf/arch/csky/include/perf_regs.h
index 25ac3bdcb9d1..1afcc0e916c2 100644
--- a/tools/perf/arch/csky/include/perf_regs.h
+++ b/tools/perf/arch/csky/include/perf_regs.h
@@ -15,86 +15,4 @@
 #define PERF_REG_IP	PERF_REG_CSKY_PC
 #define PERF_REG_SP	PERF_REG_CSKY_SP
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_CSKY_A0:
-		return "a0";
-	case PERF_REG_CSKY_A1:
-		return "a1";
-	case PERF_REG_CSKY_A2:
-		return "a2";
-	case PERF_REG_CSKY_A3:
-		return "a3";
-	case PERF_REG_CSKY_REGS0:
-		return "regs0";
-	case PERF_REG_CSKY_REGS1:
-		return "regs1";
-	case PERF_REG_CSKY_REGS2:
-		return "regs2";
-	case PERF_REG_CSKY_REGS3:
-		return "regs3";
-	case PERF_REG_CSKY_REGS4:
-		return "regs4";
-	case PERF_REG_CSKY_REGS5:
-		return "regs5";
-	case PERF_REG_CSKY_REGS6:
-		return "regs6";
-	case PERF_REG_CSKY_REGS7:
-		return "regs7";
-	case PERF_REG_CSKY_REGS8:
-		return "regs8";
-	case PERF_REG_CSKY_REGS9:
-		return "regs9";
-	case PERF_REG_CSKY_SP:
-		return "sp";
-	case PERF_REG_CSKY_LR:
-		return "lr";
-	case PERF_REG_CSKY_PC:
-		return "pc";
-#if defined(__CSKYABIV2__)
-	case PERF_REG_CSKY_EXREGS0:
-		return "exregs0";
-	case PERF_REG_CSKY_EXREGS1:
-		return "exregs1";
-	case PERF_REG_CSKY_EXREGS2:
-		return "exregs2";
-	case PERF_REG_CSKY_EXREGS3:
-		return "exregs3";
-	case PERF_REG_CSKY_EXREGS4:
-		return "exregs4";
-	case PERF_REG_CSKY_EXREGS5:
-		return "exregs5";
-	case PERF_REG_CSKY_EXREGS6:
-		return "exregs6";
-	case PERF_REG_CSKY_EXREGS7:
-		return "exregs7";
-	case PERF_REG_CSKY_EXREGS8:
-		return "exregs8";
-	case PERF_REG_CSKY_EXREGS9:
-		return "exregs9";
-	case PERF_REG_CSKY_EXREGS10:
-		return "exregs10";
-	case PERF_REG_CSKY_EXREGS11:
-		return "exregs11";
-	case PERF_REG_CSKY_EXREGS12:
-		return "exregs12";
-	case PERF_REG_CSKY_EXREGS13:
-		return "exregs13";
-	case PERF_REG_CSKY_EXREGS14:
-		return "exregs14";
-	case PERF_REG_CSKY_TLS:
-		return "tls";
-	case PERF_REG_CSKY_HI:
-		return "hi";
-	case PERF_REG_CSKY_LO:
-		return "lo";
-#endif
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
index ee73b36a14d1..b8cd8bbb37ba 100644
--- a/tools/perf/arch/mips/include/perf_regs.h
+++ b/tools/perf/arch/mips/include/perf_regs.h
@@ -12,73 +12,4 @@
 
 #define PERF_REGS_MASK ((1ULL << PERF_REG_MIPS_MAX) - 1)
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_MIPS_PC:
-		return "PC";
-	case PERF_REG_MIPS_R1:
-		return "$1";
-	case PERF_REG_MIPS_R2:
-		return "$2";
-	case PERF_REG_MIPS_R3:
-		return "$3";
-	case PERF_REG_MIPS_R4:
-		return "$4";
-	case PERF_REG_MIPS_R5:
-		return "$5";
-	case PERF_REG_MIPS_R6:
-		return "$6";
-	case PERF_REG_MIPS_R7:
-		return "$7";
-	case PERF_REG_MIPS_R8:
-		return "$8";
-	case PERF_REG_MIPS_R9:
-		return "$9";
-	case PERF_REG_MIPS_R10:
-		return "$10";
-	case PERF_REG_MIPS_R11:
-		return "$11";
-	case PERF_REG_MIPS_R12:
-		return "$12";
-	case PERF_REG_MIPS_R13:
-		return "$13";
-	case PERF_REG_MIPS_R14:
-		return "$14";
-	case PERF_REG_MIPS_R15:
-		return "$15";
-	case PERF_REG_MIPS_R16:
-		return "$16";
-	case PERF_REG_MIPS_R17:
-		return "$17";
-	case PERF_REG_MIPS_R18:
-		return "$18";
-	case PERF_REG_MIPS_R19:
-		return "$19";
-	case PERF_REG_MIPS_R20:
-		return "$20";
-	case PERF_REG_MIPS_R21:
-		return "$21";
-	case PERF_REG_MIPS_R22:
-		return "$22";
-	case PERF_REG_MIPS_R23:
-		return "$23";
-	case PERF_REG_MIPS_R24:
-		return "$24";
-	case PERF_REG_MIPS_R25:
-		return "$25";
-	case PERF_REG_MIPS_R28:
-		return "$28";
-	case PERF_REG_MIPS_R29:
-		return "$29";
-	case PERF_REG_MIPS_R30:
-		return "$30";
-	case PERF_REG_MIPS_R31:
-		return "$31";
-	default:
-		break;
-	}
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index 93339d17acc4..9bb17c3f370b 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -19,70 +19,4 @@ void perf_regs_load(u64 *regs);
 #define PERF_REG_IP     PERF_REG_POWERPC_NIP
 #define PERF_REG_SP     PERF_REG_POWERPC_R1
 
-static const char *reg_names[] = {
-	[PERF_REG_POWERPC_R0] = "r0",
-	[PERF_REG_POWERPC_R1] = "r1",
-	[PERF_REG_POWERPC_R2] = "r2",
-	[PERF_REG_POWERPC_R3] = "r3",
-	[PERF_REG_POWERPC_R4] = "r4",
-	[PERF_REG_POWERPC_R5] = "r5",
-	[PERF_REG_POWERPC_R6] = "r6",
-	[PERF_REG_POWERPC_R7] = "r7",
-	[PERF_REG_POWERPC_R8] = "r8",
-	[PERF_REG_POWERPC_R9] = "r9",
-	[PERF_REG_POWERPC_R10] = "r10",
-	[PERF_REG_POWERPC_R11] = "r11",
-	[PERF_REG_POWERPC_R12] = "r12",
-	[PERF_REG_POWERPC_R13] = "r13",
-	[PERF_REG_POWERPC_R14] = "r14",
-	[PERF_REG_POWERPC_R15] = "r15",
-	[PERF_REG_POWERPC_R16] = "r16",
-	[PERF_REG_POWERPC_R17] = "r17",
-	[PERF_REG_POWERPC_R18] = "r18",
-	[PERF_REG_POWERPC_R19] = "r19",
-	[PERF_REG_POWERPC_R20] = "r20",
-	[PERF_REG_POWERPC_R21] = "r21",
-	[PERF_REG_POWERPC_R22] = "r22",
-	[PERF_REG_POWERPC_R23] = "r23",
-	[PERF_REG_POWERPC_R24] = "r24",
-	[PERF_REG_POWERPC_R25] = "r25",
-	[PERF_REG_POWERPC_R26] = "r26",
-	[PERF_REG_POWERPC_R27] = "r27",
-	[PERF_REG_POWERPC_R28] = "r28",
-	[PERF_REG_POWERPC_R29] = "r29",
-	[PERF_REG_POWERPC_R30] = "r30",
-	[PERF_REG_POWERPC_R31] = "r31",
-	[PERF_REG_POWERPC_NIP] = "nip",
-	[PERF_REG_POWERPC_MSR] = "msr",
-	[PERF_REG_POWERPC_ORIG_R3] = "orig_r3",
-	[PERF_REG_POWERPC_CTR] = "ctr",
-	[PERF_REG_POWERPC_LINK] = "link",
-	[PERF_REG_POWERPC_XER] = "xer",
-	[PERF_REG_POWERPC_CCR] = "ccr",
-	[PERF_REG_POWERPC_SOFTE] = "softe",
-	[PERF_REG_POWERPC_TRAP] = "trap",
-	[PERF_REG_POWERPC_DAR] = "dar",
-	[PERF_REG_POWERPC_DSISR] = "dsisr",
-	[PERF_REG_POWERPC_SIER] = "sier",
-	[PERF_REG_POWERPC_MMCRA] = "mmcra",
-	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
-	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
-	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
-	[PERF_REG_POWERPC_MMCR3] = "mmcr3",
-	[PERF_REG_POWERPC_SIER2] = "sier2",
-	[PERF_REG_POWERPC_SIER3] = "sier3",
-	[PERF_REG_POWERPC_PMC1] = "pmc1",
-	[PERF_REG_POWERPC_PMC2] = "pmc2",
-	[PERF_REG_POWERPC_PMC3] = "pmc3",
-	[PERF_REG_POWERPC_PMC4] = "pmc4",
-	[PERF_REG_POWERPC_PMC5] = "pmc5",
-	[PERF_REG_POWERPC_PMC6] = "pmc6",
-	[PERF_REG_POWERPC_SDAR] = "sdar",
-	[PERF_REG_POWERPC_SIAR] = "siar",
-};
-
-static inline const char *__perf_reg_name(int id)
-{
-	return reg_names[id];
-}
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/riscv/include/perf_regs.h b/tools/perf/arch/riscv/include/perf_regs.h
index 6b02a767c918..6944bf0de53e 100644
--- a/tools/perf/arch/riscv/include/perf_regs.h
+++ b/tools/perf/arch/riscv/include/perf_regs.h
@@ -19,78 +19,4 @@
 #define PERF_REG_IP	PERF_REG_RISCV_PC
 #define PERF_REG_SP	PERF_REG_RISCV_SP
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_RISCV_PC:
-		return "pc";
-	case PERF_REG_RISCV_RA:
-		return "ra";
-	case PERF_REG_RISCV_SP:
-		return "sp";
-	case PERF_REG_RISCV_GP:
-		return "gp";
-	case PERF_REG_RISCV_TP:
-		return "tp";
-	case PERF_REG_RISCV_T0:
-		return "t0";
-	case PERF_REG_RISCV_T1:
-		return "t1";
-	case PERF_REG_RISCV_T2:
-		return "t2";
-	case PERF_REG_RISCV_S0:
-		return "s0";
-	case PERF_REG_RISCV_S1:
-		return "s1";
-	case PERF_REG_RISCV_A0:
-		return "a0";
-	case PERF_REG_RISCV_A1:
-		return "a1";
-	case PERF_REG_RISCV_A2:
-		return "a2";
-	case PERF_REG_RISCV_A3:
-		return "a3";
-	case PERF_REG_RISCV_A4:
-		return "a4";
-	case PERF_REG_RISCV_A5:
-		return "a5";
-	case PERF_REG_RISCV_A6:
-		return "a6";
-	case PERF_REG_RISCV_A7:
-		return "a7";
-	case PERF_REG_RISCV_S2:
-		return "s2";
-	case PERF_REG_RISCV_S3:
-		return "s3";
-	case PERF_REG_RISCV_S4:
-		return "s4";
-	case PERF_REG_RISCV_S5:
-		return "s5";
-	case PERF_REG_RISCV_S6:
-		return "s6";
-	case PERF_REG_RISCV_S7:
-		return "s7";
-	case PERF_REG_RISCV_S8:
-		return "s8";
-	case PERF_REG_RISCV_S9:
-		return "s9";
-	case PERF_REG_RISCV_S10:
-		return "s10";
-	case PERF_REG_RISCV_S11:
-		return "s11";
-	case PERF_REG_RISCV_T3:
-		return "t3";
-	case PERF_REG_RISCV_T4:
-		return "t4";
-	case PERF_REG_RISCV_T5:
-		return "t5";
-	case PERF_REG_RISCV_T6:
-		return "t6";
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/s390/include/perf_regs.h b/tools/perf/arch/s390/include/perf_regs.h
index ce3031526623..52fcc0891da6 100644
--- a/tools/perf/arch/s390/include/perf_regs.h
+++ b/tools/perf/arch/s390/include/perf_regs.h
@@ -14,82 +14,4 @@ void perf_regs_load(u64 *regs);
 #define PERF_REG_IP PERF_REG_S390_PC
 #define PERF_REG_SP PERF_REG_S390_R15
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_S390_R0:
-		return "R0";
-	case PERF_REG_S390_R1:
-		return "R1";
-	case PERF_REG_S390_R2:
-		return "R2";
-	case PERF_REG_S390_R3:
-		return "R3";
-	case PERF_REG_S390_R4:
-		return "R4";
-	case PERF_REG_S390_R5:
-		return "R5";
-	case PERF_REG_S390_R6:
-		return "R6";
-	case PERF_REG_S390_R7:
-		return "R7";
-	case PERF_REG_S390_R8:
-		return "R8";
-	case PERF_REG_S390_R9:
-		return "R9";
-	case PERF_REG_S390_R10:
-		return "R10";
-	case PERF_REG_S390_R11:
-		return "R11";
-	case PERF_REG_S390_R12:
-		return "R12";
-	case PERF_REG_S390_R13:
-		return "R13";
-	case PERF_REG_S390_R14:
-		return "R14";
-	case PERF_REG_S390_R15:
-		return "R15";
-	case PERF_REG_S390_FP0:
-		return "FP0";
-	case PERF_REG_S390_FP1:
-		return "FP1";
-	case PERF_REG_S390_FP2:
-		return "FP2";
-	case PERF_REG_S390_FP3:
-		return "FP3";
-	case PERF_REG_S390_FP4:
-		return "FP4";
-	case PERF_REG_S390_FP5:
-		return "FP5";
-	case PERF_REG_S390_FP6:
-		return "FP6";
-	case PERF_REG_S390_FP7:
-		return "FP7";
-	case PERF_REG_S390_FP8:
-		return "FP8";
-	case PERF_REG_S390_FP9:
-		return "FP9";
-	case PERF_REG_S390_FP10:
-		return "FP10";
-	case PERF_REG_S390_FP11:
-		return "FP11";
-	case PERF_REG_S390_FP12:
-		return "FP12";
-	case PERF_REG_S390_FP13:
-		return "FP13";
-	case PERF_REG_S390_FP14:
-		return "FP14";
-	case PERF_REG_S390_FP15:
-		return "FP15";
-	case PERF_REG_S390_MASK:
-		return "MASK";
-	case PERF_REG_S390_PC:
-		return "PC";
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
index cddc4cdc0d9b..16e23b722042 100644
--- a/tools/perf/arch/x86/include/perf_regs.h
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -23,86 +23,4 @@ void perf_regs_load(u64 *regs);
 #define PERF_REG_IP PERF_REG_X86_IP
 #define PERF_REG_SP PERF_REG_X86_SP
 
-static inline const char *__perf_reg_name(int id)
-{
-	switch (id) {
-	case PERF_REG_X86_AX:
-		return "AX";
-	case PERF_REG_X86_BX:
-		return "BX";
-	case PERF_REG_X86_CX:
-		return "CX";
-	case PERF_REG_X86_DX:
-		return "DX";
-	case PERF_REG_X86_SI:
-		return "SI";
-	case PERF_REG_X86_DI:
-		return "DI";
-	case PERF_REG_X86_BP:
-		return "BP";
-	case PERF_REG_X86_SP:
-		return "SP";
-	case PERF_REG_X86_IP:
-		return "IP";
-	case PERF_REG_X86_FLAGS:
-		return "FLAGS";
-	case PERF_REG_X86_CS:
-		return "CS";
-	case PERF_REG_X86_SS:
-		return "SS";
-	case PERF_REG_X86_DS:
-		return "DS";
-	case PERF_REG_X86_ES:
-		return "ES";
-	case PERF_REG_X86_FS:
-		return "FS";
-	case PERF_REG_X86_GS:
-		return "GS";
-#ifdef HAVE_ARCH_X86_64_SUPPORT
-	case PERF_REG_X86_R8:
-		return "R8";
-	case PERF_REG_X86_R9:
-		return "R9";
-	case PERF_REG_X86_R10:
-		return "R10";
-	case PERF_REG_X86_R11:
-		return "R11";
-	case PERF_REG_X86_R12:
-		return "R12";
-	case PERF_REG_X86_R13:
-		return "R13";
-	case PERF_REG_X86_R14:
-		return "R14";
-	case PERF_REG_X86_R15:
-		return "R15";
-#endif /* HAVE_ARCH_X86_64_SUPPORT */
-
-#define XMM(x) \
-	case PERF_REG_X86_XMM ## x:	\
-	case PERF_REG_X86_XMM ## x + 1:	\
-		return "XMM" #x;
-	XMM(0)
-	XMM(1)
-	XMM(2)
-	XMM(3)
-	XMM(4)
-	XMM(5)
-	XMM(6)
-	XMM(7)
-	XMM(8)
-	XMM(9)
-	XMM(10)
-	XMM(11)
-	XMM(12)
-	XMM(13)
-	XMM(14)
-	XMM(15)
-#undef XMM
-	default:
-		return NULL;
-	}
-
-	return NULL;
-}
-
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9434367af166..da2175d70ac9 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -15,6 +15,7 @@
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/trace-event.h"
+#include "util/env.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/evsel_fprintf.h"
@@ -648,7 +649,7 @@ out:
 	return 0;
 }
 
-static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
+static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, const char *arch,
 				     FILE *fp)
 {
 	unsigned i = 0, r;
@@ -661,7 +662,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
 
 	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
 		u64 val = regs->regs[i++];
-		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r), val);
+		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, arch), val);
 	}
 
 	return printed;
@@ -718,17 +719,17 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen,
 }
 
 static int perf_sample__fprintf_iregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, FILE *fp)
+				      struct perf_event_attr *attr, const char *arch, FILE *fp)
 {
 	return perf_sample__fprintf_regs(&sample->intr_regs,
-					 attr->sample_regs_intr, fp);
+					 attr->sample_regs_intr, arch, fp);
 }
 
 static int perf_sample__fprintf_uregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, FILE *fp)
+				      struct perf_event_attr *attr, const char *arch, FILE *fp)
 {
 	return perf_sample__fprintf_regs(&sample->user_regs,
-					 attr->sample_regs_user, fp);
+					 attr->sample_regs_user, arch, fp);
 }
 
 static int perf_sample__fprintf_start(struct perf_script *script,
@@ -2000,6 +2001,7 @@ static void process_event(struct perf_script *script,
 	struct evsel_script *es = evsel->priv;
 	FILE *fp = es->fp;
 	char str[PAGE_SIZE_NAME_LEN];
+	const char *arch = perf_env__arch(machine->env);
 
 	if (output[type].fields == 0)
 		return;
@@ -2066,10 +2068,10 @@ static void process_event(struct perf_script *script,
 	}
 
 	if (PRINT_FIELD(IREGS))
-		perf_sample__fprintf_iregs(sample, attr, fp);
+		perf_sample__fprintf_iregs(sample, attr, arch, fp);
 
 	if (PRINT_FIELD(UREGS))
-		perf_sample__fprintf_uregs(sample, attr, fp);
+		perf_sample__fprintf_uregs(sample, attr, arch, fp);
 
 	if (PRINT_FIELD(BRSTACK))
 		perf_sample__fprintf_brstack(sample, thread, attr, fp);
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 06a7461ba864..a982e40ee5a9 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
+#include <string.h>
 #include "perf_regs.h"
 #include "event.h"
 
@@ -20,6 +21,671 @@ uint64_t __weak arch__user_reg_mask(void)
 }
 
 #ifdef HAVE_PERF_REGS_SUPPORT
+
+#define perf_event_arm_regs perf_event_arm64_regs
+#include "../../arch/arm64/include/uapi/asm/perf_regs.h"
+#undef perf_event_arm_regs
+
+#include "../../arch/arm/include/uapi/asm/perf_regs.h"
+#include "../../arch/csky/include/uapi/asm/perf_regs.h"
+#include "../../arch/mips/include/uapi/asm/perf_regs.h"
+#include "../../arch/powerpc/include/uapi/asm/perf_regs.h"
+#include "../../arch/riscv/include/uapi/asm/perf_regs.h"
+#include "../../arch/s390/include/uapi/asm/perf_regs.h"
+#include "../../arch/x86/include/uapi/asm/perf_regs.h"
+
+static const char *__perf_reg_name_arm64(int id)
+{
+	switch (id) {
+	case PERF_REG_ARM64_X0:
+		return "x0";
+	case PERF_REG_ARM64_X1:
+		return "x1";
+	case PERF_REG_ARM64_X2:
+		return "x2";
+	case PERF_REG_ARM64_X3:
+		return "x3";
+	case PERF_REG_ARM64_X4:
+		return "x4";
+	case PERF_REG_ARM64_X5:
+		return "x5";
+	case PERF_REG_ARM64_X6:
+		return "x6";
+	case PERF_REG_ARM64_X7:
+		return "x7";
+	case PERF_REG_ARM64_X8:
+		return "x8";
+	case PERF_REG_ARM64_X9:
+		return "x9";
+	case PERF_REG_ARM64_X10:
+		return "x10";
+	case PERF_REG_ARM64_X11:
+		return "x11";
+	case PERF_REG_ARM64_X12:
+		return "x12";
+	case PERF_REG_ARM64_X13:
+		return "x13";
+	case PERF_REG_ARM64_X14:
+		return "x14";
+	case PERF_REG_ARM64_X15:
+		return "x15";
+	case PERF_REG_ARM64_X16:
+		return "x16";
+	case PERF_REG_ARM64_X17:
+		return "x17";
+	case PERF_REG_ARM64_X18:
+		return "x18";
+	case PERF_REG_ARM64_X19:
+		return "x19";
+	case PERF_REG_ARM64_X20:
+		return "x20";
+	case PERF_REG_ARM64_X21:
+		return "x21";
+	case PERF_REG_ARM64_X22:
+		return "x22";
+	case PERF_REG_ARM64_X23:
+		return "x23";
+	case PERF_REG_ARM64_X24:
+		return "x24";
+	case PERF_REG_ARM64_X25:
+		return "x25";
+	case PERF_REG_ARM64_X26:
+		return "x26";
+	case PERF_REG_ARM64_X27:
+		return "x27";
+	case PERF_REG_ARM64_X28:
+		return "x28";
+	case PERF_REG_ARM64_X29:
+		return "x29";
+	case PERF_REG_ARM64_SP:
+		return "sp";
+	case PERF_REG_ARM64_LR:
+		return "lr";
+	case PERF_REG_ARM64_PC:
+		return "pc";
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static const char *__perf_reg_name_arm(int id)
+{
+	switch (id) {
+	case PERF_REG_ARM_R0:
+		return "r0";
+	case PERF_REG_ARM_R1:
+		return "r1";
+	case PERF_REG_ARM_R2:
+		return "r2";
+	case PERF_REG_ARM_R3:
+		return "r3";
+	case PERF_REG_ARM_R4:
+		return "r4";
+	case PERF_REG_ARM_R5:
+		return "r5";
+	case PERF_REG_ARM_R6:
+		return "r6";
+	case PERF_REG_ARM_R7:
+		return "r7";
+	case PERF_REG_ARM_R8:
+		return "r8";
+	case PERF_REG_ARM_R9:
+		return "r9";
+	case PERF_REG_ARM_R10:
+		return "r10";
+	case PERF_REG_ARM_FP:
+		return "fp";
+	case PERF_REG_ARM_IP:
+		return "ip";
+	case PERF_REG_ARM_SP:
+		return "sp";
+	case PERF_REG_ARM_LR:
+		return "lr";
+	case PERF_REG_ARM_PC:
+		return "pc";
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static const char *__perf_reg_name_csky(int id)
+{
+	switch (id) {
+	case PERF_REG_CSKY_A0:
+		return "a0";
+	case PERF_REG_CSKY_A1:
+		return "a1";
+	case PERF_REG_CSKY_A2:
+		return "a2";
+	case PERF_REG_CSKY_A3:
+		return "a3";
+	case PERF_REG_CSKY_REGS0:
+		return "regs0";
+	case PERF_REG_CSKY_REGS1:
+		return "regs1";
+	case PERF_REG_CSKY_REGS2:
+		return "regs2";
+	case PERF_REG_CSKY_REGS3:
+		return "regs3";
+	case PERF_REG_CSKY_REGS4:
+		return "regs4";
+	case PERF_REG_CSKY_REGS5:
+		return "regs5";
+	case PERF_REG_CSKY_REGS6:
+		return "regs6";
+	case PERF_REG_CSKY_REGS7:
+		return "regs7";
+	case PERF_REG_CSKY_REGS8:
+		return "regs8";
+	case PERF_REG_CSKY_REGS9:
+		return "regs9";
+	case PERF_REG_CSKY_SP:
+		return "sp";
+	case PERF_REG_CSKY_LR:
+		return "lr";
+	case PERF_REG_CSKY_PC:
+		return "pc";
+#if defined(__CSKYABIV2__)
+	case PERF_REG_CSKY_EXREGS0:
+		return "exregs0";
+	case PERF_REG_CSKY_EXREGS1:
+		return "exregs1";
+	case PERF_REG_CSKY_EXREGS2:
+		return "exregs2";
+	case PERF_REG_CSKY_EXREGS3:
+		return "exregs3";
+	case PERF_REG_CSKY_EXREGS4:
+		return "exregs4";
+	case PERF_REG_CSKY_EXREGS5:
+		return "exregs5";
+	case PERF_REG_CSKY_EXREGS6:
+		return "exregs6";
+	case PERF_REG_CSKY_EXREGS7:
+		return "exregs7";
+	case PERF_REG_CSKY_EXREGS8:
+		return "exregs8";
+	case PERF_REG_CSKY_EXREGS9:
+		return "exregs9";
+	case PERF_REG_CSKY_EXREGS10:
+		return "exregs10";
+	case PERF_REG_CSKY_EXREGS11:
+		return "exregs11";
+	case PERF_REG_CSKY_EXREGS12:
+		return "exregs12";
+	case PERF_REG_CSKY_EXREGS13:
+		return "exregs13";
+	case PERF_REG_CSKY_EXREGS14:
+		return "exregs14";
+	case PERF_REG_CSKY_TLS:
+		return "tls";
+	case PERF_REG_CSKY_HI:
+		return "hi";
+	case PERF_REG_CSKY_LO:
+		return "lo";
+#endif
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static const char *__perf_reg_name_mips(int id)
+{
+	switch (id) {
+	case PERF_REG_MIPS_PC:
+		return "PC";
+	case PERF_REG_MIPS_R1:
+		return "$1";
+	case PERF_REG_MIPS_R2:
+		return "$2";
+	case PERF_REG_MIPS_R3:
+		return "$3";
+	case PERF_REG_MIPS_R4:
+		return "$4";
+	case PERF_REG_MIPS_R5:
+		return "$5";
+	case PERF_REG_MIPS_R6:
+		return "$6";
+	case PERF_REG_MIPS_R7:
+		return "$7";
+	case PERF_REG_MIPS_R8:
+		return "$8";
+	case PERF_REG_MIPS_R9:
+		return "$9";
+	case PERF_REG_MIPS_R10:
+		return "$10";
+	case PERF_REG_MIPS_R11:
+		return "$11";
+	case PERF_REG_MIPS_R12:
+		return "$12";
+	case PERF_REG_MIPS_R13:
+		return "$13";
+	case PERF_REG_MIPS_R14:
+		return "$14";
+	case PERF_REG_MIPS_R15:
+		return "$15";
+	case PERF_REG_MIPS_R16:
+		return "$16";
+	case PERF_REG_MIPS_R17:
+		return "$17";
+	case PERF_REG_MIPS_R18:
+		return "$18";
+	case PERF_REG_MIPS_R19:
+		return "$19";
+	case PERF_REG_MIPS_R20:
+		return "$20";
+	case PERF_REG_MIPS_R21:
+		return "$21";
+	case PERF_REG_MIPS_R22:
+		return "$22";
+	case PERF_REG_MIPS_R23:
+		return "$23";
+	case PERF_REG_MIPS_R24:
+		return "$24";
+	case PERF_REG_MIPS_R25:
+		return "$25";
+	case PERF_REG_MIPS_R28:
+		return "$28";
+	case PERF_REG_MIPS_R29:
+		return "$29";
+	case PERF_REG_MIPS_R30:
+		return "$30";
+	case PERF_REG_MIPS_R31:
+		return "$31";
+	default:
+		break;
+	}
+	return NULL;
+}
+
+static const char *__perf_reg_name_powerpc(int id)
+{
+	switch (id) {
+	case PERF_REG_POWERPC_R0:
+		return "r0";
+	case PERF_REG_POWERPC_R1:
+		return "r1";
+	case PERF_REG_POWERPC_R2:
+		return "r2";
+	case PERF_REG_POWERPC_R3:
+		return "r3";
+	case PERF_REG_POWERPC_R4:
+		return "r4";
+	case PERF_REG_POWERPC_R5:
+		return "r5";
+	case PERF_REG_POWERPC_R6:
+		return "r6";
+	case PERF_REG_POWERPC_R7:
+		return "r7";
+	case PERF_REG_POWERPC_R8:
+		return "r8";
+	case PERF_REG_POWERPC_R9:
+		return "r9";
+	case PERF_REG_POWERPC_R10:
+		return "r10";
+	case PERF_REG_POWERPC_R11:
+		return "r11";
+	case PERF_REG_POWERPC_R12:
+		return "r12";
+	case PERF_REG_POWERPC_R13:
+		return "r13";
+	case PERF_REG_POWERPC_R14:
+		return "r14";
+	case PERF_REG_POWERPC_R15:
+		return "r15";
+	case PERF_REG_POWERPC_R16:
+		return "r16";
+	case PERF_REG_POWERPC_R17:
+		return "r17";
+	case PERF_REG_POWERPC_R18:
+		return "r18";
+	case PERF_REG_POWERPC_R19:
+		return "r19";
+	case PERF_REG_POWERPC_R20:
+		return "r20";
+	case PERF_REG_POWERPC_R21:
+		return "r21";
+	case PERF_REG_POWERPC_R22:
+		return "r22";
+	case PERF_REG_POWERPC_R23:
+		return "r23";
+	case PERF_REG_POWERPC_R24:
+		return "r24";
+	case PERF_REG_POWERPC_R25:
+		return "r25";
+	case PERF_REG_POWERPC_R26:
+		return "r26";
+	case PERF_REG_POWERPC_R27:
+		return "r27";
+	case PERF_REG_POWERPC_R28:
+		return "r28";
+	case PERF_REG_POWERPC_R29:
+		return "r29";
+	case PERF_REG_POWERPC_R30:
+		return "r30";
+	case PERF_REG_POWERPC_R31:
+		return "r31";
+	case PERF_REG_POWERPC_NIP:
+		return "nip";
+	case PERF_REG_POWERPC_MSR:
+		return "msr";
+	case PERF_REG_POWERPC_ORIG_R3:
+		return "orig_r3";
+	case PERF_REG_POWERPC_CTR:
+		return "ctr";
+	case PERF_REG_POWERPC_LINK:
+		return "link";
+	case PERF_REG_POWERPC_XER:
+		return "xer";
+	case PERF_REG_POWERPC_CCR:
+		return "ccr";
+	case PERF_REG_POWERPC_SOFTE:
+		return "softe";
+	case PERF_REG_POWERPC_TRAP:
+		return "trap";
+	case PERF_REG_POWERPC_DAR:
+		return "dar";
+	case PERF_REG_POWERPC_DSISR:
+		return "dsisr";
+	case PERF_REG_POWERPC_SIER:
+		return "sier";
+	case PERF_REG_POWERPC_MMCRA:
+		return "mmcra";
+	case PERF_REG_POWERPC_MMCR0:
+		return "mmcr0";
+	case PERF_REG_POWERPC_MMCR1:
+		return "mmcr1";
+	case PERF_REG_POWERPC_MMCR2:
+		return "mmcr2";
+	case PERF_REG_POWERPC_MMCR3:
+		return "mmcr3";
+	case PERF_REG_POWERPC_SIER2:
+		return "sier2";
+	case PERF_REG_POWERPC_SIER3:
+		return "sier3";
+	case PERF_REG_POWERPC_PMC1:
+		return "pmc1";
+	case PERF_REG_POWERPC_PMC2:
+		return "pmc2";
+	case PERF_REG_POWERPC_PMC3:
+		return "pmc3";
+	case PERF_REG_POWERPC_PMC4:
+		return "pmc4";
+	case PERF_REG_POWERPC_PMC5:
+		return "pmc5";
+	case PERF_REG_POWERPC_PMC6:
+		return "pmc6";
+	case PERF_REG_POWERPC_SDAR:
+		return "sdar";
+	case PERF_REG_POWERPC_SIAR:
+		return "siar";
+	default:
+		break;
+	}
+	return NULL;
+}
+
+static const char *__perf_reg_name_riscv(int id)
+{
+	switch (id) {
+	case PERF_REG_RISCV_PC:
+		return "pc";
+	case PERF_REG_RISCV_RA:
+		return "ra";
+	case PERF_REG_RISCV_SP:
+		return "sp";
+	case PERF_REG_RISCV_GP:
+		return "gp";
+	case PERF_REG_RISCV_TP:
+		return "tp";
+	case PERF_REG_RISCV_T0:
+		return "t0";
+	case PERF_REG_RISCV_T1:
+		return "t1";
+	case PERF_REG_RISCV_T2:
+		return "t2";
+	case PERF_REG_RISCV_S0:
+		return "s0";
+	case PERF_REG_RISCV_S1:
+		return "s1";
+	case PERF_REG_RISCV_A0:
+		return "a0";
+	case PERF_REG_RISCV_A1:
+		return "a1";
+	case PERF_REG_RISCV_A2:
+		return "a2";
+	case PERF_REG_RISCV_A3:
+		return "a3";
+	case PERF_REG_RISCV_A4:
+		return "a4";
+	case PERF_REG_RISCV_A5:
+		return "a5";
+	case PERF_REG_RISCV_A6:
+		return "a6";
+	case PERF_REG_RISCV_A7:
+		return "a7";
+	case PERF_REG_RISCV_S2:
+		return "s2";
+	case PERF_REG_RISCV_S3:
+		return "s3";
+	case PERF_REG_RISCV_S4:
+		return "s4";
+	case PERF_REG_RISCV_S5:
+		return "s5";
+	case PERF_REG_RISCV_S6:
+		return "s6";
+	case PERF_REG_RISCV_S7:
+		return "s7";
+	case PERF_REG_RISCV_S8:
+		return "s8";
+	case PERF_REG_RISCV_S9:
+		return "s9";
+	case PERF_REG_RISCV_S10:
+		return "s10";
+	case PERF_REG_RISCV_S11:
+		return "s11";
+	case PERF_REG_RISCV_T3:
+		return "t3";
+	case PERF_REG_RISCV_T4:
+		return "t4";
+	case PERF_REG_RISCV_T5:
+		return "t5";
+	case PERF_REG_RISCV_T6:
+		return "t6";
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static const char *__perf_reg_name_s390(int id)
+{
+	switch (id) {
+	case PERF_REG_S390_R0:
+		return "R0";
+	case PERF_REG_S390_R1:
+		return "R1";
+	case PERF_REG_S390_R2:
+		return "R2";
+	case PERF_REG_S390_R3:
+		return "R3";
+	case PERF_REG_S390_R4:
+		return "R4";
+	case PERF_REG_S390_R5:
+		return "R5";
+	case PERF_REG_S390_R6:
+		return "R6";
+	case PERF_REG_S390_R7:
+		return "R7";
+	case PERF_REG_S390_R8:
+		return "R8";
+	case PERF_REG_S390_R9:
+		return "R9";
+	case PERF_REG_S390_R10:
+		return "R10";
+	case PERF_REG_S390_R11:
+		return "R11";
+	case PERF_REG_S390_R12:
+		return "R12";
+	case PERF_REG_S390_R13:
+		return "R13";
+	case PERF_REG_S390_R14:
+		return "R14";
+	case PERF_REG_S390_R15:
+		return "R15";
+	case PERF_REG_S390_FP0:
+		return "FP0";
+	case PERF_REG_S390_FP1:
+		return "FP1";
+	case PERF_REG_S390_FP2:
+		return "FP2";
+	case PERF_REG_S390_FP3:
+		return "FP3";
+	case PERF_REG_S390_FP4:
+		return "FP4";
+	case PERF_REG_S390_FP5:
+		return "FP5";
+	case PERF_REG_S390_FP6:
+		return "FP6";
+	case PERF_REG_S390_FP7:
+		return "FP7";
+	case PERF_REG_S390_FP8:
+		return "FP8";
+	case PERF_REG_S390_FP9:
+		return "FP9";
+	case PERF_REG_S390_FP10:
+		return "FP10";
+	case PERF_REG_S390_FP11:
+		return "FP11";
+	case PERF_REG_S390_FP12:
+		return "FP12";
+	case PERF_REG_S390_FP13:
+		return "FP13";
+	case PERF_REG_S390_FP14:
+		return "FP14";
+	case PERF_REG_S390_FP15:
+		return "FP15";
+	case PERF_REG_S390_MASK:
+		return "MASK";
+	case PERF_REG_S390_PC:
+		return "PC";
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static const char *__perf_reg_name_x86(int id)
+{
+	switch (id) {
+	case PERF_REG_X86_AX:
+		return "AX";
+	case PERF_REG_X86_BX:
+		return "BX";
+	case PERF_REG_X86_CX:
+		return "CX";
+	case PERF_REG_X86_DX:
+		return "DX";
+	case PERF_REG_X86_SI:
+		return "SI";
+	case PERF_REG_X86_DI:
+		return "DI";
+	case PERF_REG_X86_BP:
+		return "BP";
+	case PERF_REG_X86_SP:
+		return "SP";
+	case PERF_REG_X86_IP:
+		return "IP";
+	case PERF_REG_X86_FLAGS:
+		return "FLAGS";
+	case PERF_REG_X86_CS:
+		return "CS";
+	case PERF_REG_X86_SS:
+		return "SS";
+	case PERF_REG_X86_DS:
+		return "DS";
+	case PERF_REG_X86_ES:
+		return "ES";
+	case PERF_REG_X86_FS:
+		return "FS";
+	case PERF_REG_X86_GS:
+		return "GS";
+	case PERF_REG_X86_R8:
+		return "R8";
+	case PERF_REG_X86_R9:
+		return "R9";
+	case PERF_REG_X86_R10:
+		return "R10";
+	case PERF_REG_X86_R11:
+		return "R11";
+	case PERF_REG_X86_R12:
+		return "R12";
+	case PERF_REG_X86_R13:
+		return "R13";
+	case PERF_REG_X86_R14:
+		return "R14";
+	case PERF_REG_X86_R15:
+		return "R15";
+
+#define XMM(x) \
+	case PERF_REG_X86_XMM ## x:	\
+	case PERF_REG_X86_XMM ## x + 1:	\
+		return "XMM" #x;
+	XMM(0)
+	XMM(1)
+	XMM(2)
+	XMM(3)
+	XMM(4)
+	XMM(5)
+	XMM(6)
+	XMM(7)
+	XMM(8)
+	XMM(9)
+	XMM(10)
+	XMM(11)
+	XMM(12)
+	XMM(13)
+	XMM(14)
+	XMM(15)
+#undef XMM
+	default:
+		return NULL;
+	}
+
+	return NULL;
+}
+
+const char *perf_reg_name(int id, const char *arch)
+{
+	const char *reg_name = NULL;
+
+	if (!strcmp(arch, "csky"))
+		reg_name = __perf_reg_name_csky(id);
+	else if (!strcmp(arch, "mips"))
+		reg_name = __perf_reg_name_mips(id);
+	else if (!strcmp(arch, "powerpc"))
+		reg_name = __perf_reg_name_powerpc(id);
+	else if (!strcmp(arch, "riscv"))
+		reg_name = __perf_reg_name_riscv(id);
+	else if (!strcmp(arch, "s390"))
+		reg_name = __perf_reg_name_s390(id);
+	else if (!strcmp(arch, "x86"))
+		reg_name = __perf_reg_name_x86(id);
+	else if (!strcmp(arch, "arm"))
+		reg_name = __perf_reg_name_arm(id);
+	else if (!strcmp(arch, "arm64"))
+		reg_name = __perf_reg_name_arm64(id);
+
+	return reg_name ?: "unknown";
+}
+
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
 {
 	int i, idx = 0;
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index eeac181ebccf..4e6b1299c571 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -31,22 +31,16 @@ extern const struct sample_reg sample_reg_masks[];
 
 #define DWARF_MINIMAL_REGS ((1ULL << PERF_REG_IP) | (1ULL << PERF_REG_SP))
 
+const char *perf_reg_name(int id, const char *arch);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
 
-static inline const char *perf_reg_name(int id)
-{
-	const char *reg_name = __perf_reg_name(id);
-
-	return reg_name ?: "unknown";
-}
-
 #else
 #define PERF_REGS_MASK	0
 #define PERF_REGS_MAX	0
 
 #define DWARF_MINIMAL_REGS PERF_REGS_MASK
 
-static inline const char *perf_reg_name(int id __maybe_unused)
+static inline const char *perf_reg_name(int id __maybe_unused, const char *arch __maybe_unused)
 {
 	return "unknown";
 }
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c0c010350bc2..0445bee9290f 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -36,6 +36,7 @@
 #include "../debug.h"
 #include "../dso.h"
 #include "../callchain.h"
+#include "../env.h"
 #include "../evsel.h"
 #include "../event.h"
 #include "../thread.h"
@@ -687,7 +688,7 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
 			_PyUnicode_FromString(decode));
 }
 
-static void regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)
+static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, char *bf, int size)
 {
 	unsigned int i = 0, r;
 	int printed = 0;
@@ -702,7 +703,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)
 
 		printed += scnprintf(bf + printed, size - printed,
 				     "%5s:0x%" PRIx64 " ",
-				     perf_reg_name(r), val);
+				     perf_reg_name(r, arch), val);
 	}
 }
 
@@ -711,6 +712,7 @@ static void set_regs_in_dict(PyObject *dict,
 			     struct evsel *evsel)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
+	const char *arch = perf_env__arch(evsel__env(evsel));
 
 	/*
 	 * Here value 28 is a constant size which can be used to print
@@ -722,12 +724,12 @@ static void set_regs_in_dict(PyObject *dict,
 	int size = __sw_hweight64(attr->sample_regs_intr) * 28;
 	char bf[size];
 
-	regs_map(&sample->intr_regs, attr->sample_regs_intr, bf, sizeof(bf));
+	regs_map(&sample->intr_regs, attr->sample_regs_intr, arch, bf, sizeof(bf));
 
 	pydict_set_item_string_decref(dict, "iregs",
 			_PyUnicode_FromString(bf));
 
-	regs_map(&sample->user_regs, attr->sample_regs_user, bf, sizeof(bf));
+	regs_map(&sample->user_regs, attr->sample_regs_user, arch, bf, sizeof(bf));
 
 	pydict_set_item_string_decref(dict, "uregs",
 			_PyUnicode_FromString(bf));
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d8857d1b6d7c..e1a273048681 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -15,6 +15,7 @@
 #include "map_symbol.h"
 #include "branch.h"
 #include "debug.h"
+#include "env.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "memswap.h"
@@ -1168,7 +1169,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack)
 	}
 }
 
-static void regs_dump__printf(u64 mask, u64 *regs)
+static void regs_dump__printf(u64 mask, u64 *regs, const char *arch)
 {
 	unsigned rid, i = 0;
 
@@ -1176,7 +1177,7 @@ static void regs_dump__printf(u64 mask, u64 *regs)
 		u64 val = regs[i++];
 
 		printf(".... %-5s 0x%016" PRIx64 "\n",
-		       perf_reg_name(rid), val);
+		       perf_reg_name(rid, arch), val);
 	}
 }
 
@@ -1194,7 +1195,7 @@ static inline const char *regs_dump_abi(struct regs_dump *d)
 	return regs_abi[d->abi];
 }
 
-static void regs__printf(const char *type, struct regs_dump *regs)
+static void regs__printf(const char *type, struct regs_dump *regs, const char *arch)
 {
 	u64 mask = regs->mask;
 
@@ -1203,23 +1204,23 @@ static void regs__printf(const char *type, struct regs_dump *regs)
 	       mask,
 	       regs_dump_abi(regs));
 
-	regs_dump__printf(mask, regs->regs);
+	regs_dump__printf(mask, regs->regs, arch);
 }
 
-static void regs_user__printf(struct perf_sample *sample)
+static void regs_user__printf(struct perf_sample *sample, const char *arch)
 {
 	struct regs_dump *user_regs = &sample->user_regs;
 
 	if (user_regs->regs)
-		regs__printf("user", user_regs);
+		regs__printf("user", user_regs, arch);
 }
 
-static void regs_intr__printf(struct perf_sample *sample)
+static void regs_intr__printf(struct perf_sample *sample, const char *arch)
 {
 	struct regs_dump *intr_regs = &sample->intr_regs;
 
 	if (intr_regs->regs)
-		regs__printf("intr", intr_regs);
+		regs__printf("intr", intr_regs, arch);
 }
 
 static void stack_user__printf(struct stack_dump *dump)
@@ -1304,7 +1305,7 @@ char *get_page_size_name(u64 size, char *str)
 }
 
 static void dump_sample(struct evsel *evsel, union perf_event *event,
-			struct perf_sample *sample)
+			struct perf_sample *sample, const char *arch)
 {
 	u64 sample_type;
 	char str[PAGE_SIZE_NAME_LEN];
@@ -1325,10 +1326,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 		branch_stack__printf(sample, evsel__has_branch_callstack(evsel));
 
 	if (sample_type & PERF_SAMPLE_REGS_USER)
-		regs_user__printf(sample);
+		regs_user__printf(sample, arch);
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR)
-		regs_intr__printf(sample);
+		regs_intr__printf(sample, arch);
 
 	if (sample_type & PERF_SAMPLE_STACK_USER)
 		stack_user__printf(&sample->user_stack);
@@ -1502,7 +1503,7 @@ static int machines__deliver_event(struct machines *machines,
 			++evlist->stats.nr_unknown_id;
 			return 0;
 		}
-		dump_sample(evsel, event, sample);
+		dump_sample(evsel, event, sample, perf_env__arch(machine->env));
 		if (machine == NULL) {
 			++evlist->stats.nr_unprocessable_samples;
 			return 0;
-- 
cgit v1.2.3


From 416e15ad17f84358ba3eca6b82378be97c793c62 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 15 Dec 2021 10:51:50 -0800
Subject: perf ftrace: Add 'trace' subcommand

This is a preparation to add more sub-commands for ftrace.  The
'trace' subcommand does the same thing when no subcommand is given.

Committer testing:

The previous mode, i.e. no subcommand and the new 'perf ftrace trace'
are equivalent:

  # perf ftrace -G check_preempt_curr sleep 0.00001
  # tracer: function_graph
  #
  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
   25)               |  check_preempt_curr() {
   25)               |    resched_curr() {
   25)               |      native_smp_send_reschedule() {
   25)               |        default_send_IPI_single_phys() {
   25)   0.110 us    |          __default_send_IPI_dest_field();
   25)   0.490 us    |        }
   25)   0.640 us    |      }
   25)   0.850 us    |    }
   25)   2.060 us    |  }
  # perf ftrace trace -G check_preempt_curr sleep 0.00001
  # tracer: function_graph
  #
  # CPU  DURATION                  FUNCTION CALLS
  # |     |   |                     |   |   |   |
   10)               |  check_preempt_curr() {
   10)               |    resched_curr() {
   10)               |      native_smp_send_reschedule() {
   10)               |        default_send_IPI_single_phys() {
   10)   0.080 us    |          __default_send_IPI_dest_field();
   10)   0.460 us    |        }
   10)   0.610 us    |      }
   10)   0.830 us    |    }
   10)   2.020 us    |  }
  #

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Changbin Du <changbin.du@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211215185154.360314-2-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-ftrace.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 87cb11a7a3ee..b28e762c5d54 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -879,17 +879,7 @@ int cmd_ftrace(int argc, const char **argv)
 		.tracer = DEFAULT_TRACER,
 		.target = { .uid = UINT_MAX, },
 	};
-	const char * const ftrace_usage[] = {
-		"perf ftrace [<options>] [<command>]",
-		"perf ftrace [<options>] -- <command> [<options>]",
-		NULL
-	};
-	const struct option ftrace_options[] = {
-	OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
-		   "Tracer to use: function_graph(default) or function"),
-	OPT_CALLBACK_DEFAULT('F', "funcs", NULL, "[FILTER]",
-			     "Show available functions to filter",
-			     opt_list_avail_functions, "*"),
+	const struct option common_options[] = {
 	OPT_STRING('p', "pid", &ftrace.target.pid, "pid",
 		   "Trace on existing process id"),
 	/* TODO: Add short option -t after -t/--tracer can be removed. */
@@ -901,6 +891,14 @@ int cmd_ftrace(int argc, const char **argv)
 		    "System-wide collection from all CPUs"),
 	OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu",
 		    "List of cpus to monitor"),
+	OPT_END()
+	};
+	const struct option ftrace_options[] = {
+	OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
+		   "Tracer to use: function_graph(default) or function"),
+	OPT_CALLBACK_DEFAULT('F', "funcs", NULL, "[FILTER]",
+			     "Show available functions to filter",
+			     opt_list_avail_functions, "*"),
 	OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
 		     "Trace given functions using function tracer",
 		     parse_filter_func),
@@ -923,7 +921,15 @@ int cmd_ftrace(int argc, const char **argv)
 		    "Trace children processes"),
 	OPT_UINTEGER('D', "delay", &ftrace.initial_delay,
 		     "Number of milliseconds to wait before starting tracing after program start"),
-	OPT_END()
+	OPT_PARENT(common_options),
+	};
+
+	const char * const ftrace_usage[] = {
+		"perf ftrace [<options>] [<command>]",
+		"perf ftrace [<options>] -- [<command>] [<options>]",
+		"perf ftrace trace [<options>] [<command>]",
+		"perf ftrace trace [<options>] -- [<command>] [<options>]",
+		NULL
 	};
 
 	INIT_LIST_HEAD(&ftrace.filters);
@@ -935,6 +941,11 @@ int cmd_ftrace(int argc, const char **argv)
 	if (ret < 0)
 		return -1;
 
+	if (argc > 1 && !strcmp(argv[1], "trace")) {
+		argc--;
+		argv++;
+	}
+
 	argc = parse_options(argc, argv, ftrace_options, ftrace_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc && target__none(&ftrace.target))
-- 
cgit v1.2.3


From a9b8ae8ae347941fefd6596f62586b13ae032e4b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 15 Dec 2021 10:51:51 -0800
Subject: perf ftrace: Move out common code from __cmd_ftrace

The signal setup code and evlist__prepare_workload() can be used for
other subcommands.  Let's move them out of the __cmd_ftrace().  Then
it doesn't need to pass argc and argv.

On the other hand, select_tracer() is specific to the 'trace'
subcommand so it'd better moving it into the __cmd_ftrace().

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Changbin Du <changbin.du@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211215185154.360314-3-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-ftrace.c | 63 ++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index b28e762c5d54..0f8310bd0e6c 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -565,7 +565,24 @@ static int set_tracing_options(struct perf_ftrace *ftrace)
 	return 0;
 }
 
-static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
+static void select_tracer(struct perf_ftrace *ftrace)
+{
+	bool graph = !list_empty(&ftrace->graph_funcs) ||
+		     !list_empty(&ftrace->nograph_funcs);
+	bool func = !list_empty(&ftrace->filters) ||
+		    !list_empty(&ftrace->notrace);
+
+	/* The function_graph has priority over function tracer. */
+	if (graph)
+		ftrace->tracer = "function_graph";
+	else if (func)
+		ftrace->tracer = "function";
+	/* Otherwise, the default tracer is used. */
+
+	pr_debug("%s tracer is used\n", ftrace->tracer);
+}
+
+static int __cmd_ftrace(struct perf_ftrace *ftrace)
 {
 	char *trace_file;
 	int trace_fd;
@@ -586,10 +603,7 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 		return -1;
 	}
 
-	signal(SIGINT, sig_handler);
-	signal(SIGUSR1, sig_handler);
-	signal(SIGCHLD, sig_handler);
-	signal(SIGPIPE, sig_handler);
+	select_tracer(ftrace);
 
 	if (reset_tracing_files(ftrace) < 0) {
 		pr_err("failed to reset ftrace\n");
@@ -600,11 +614,6 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 	if (write_tracing_file("trace", "0") < 0)
 		goto out;
 
-	if (argc && evlist__prepare_workload(ftrace->evlist, &ftrace->target, argv, false,
-					     ftrace__workload_exec_failed_signal) < 0) {
-		goto out;
-	}
-
 	if (set_tracing_options(ftrace) < 0)
 		goto out_reset;
 
@@ -855,23 +864,6 @@ static int parse_graph_tracer_opts(const struct option *opt,
 	return 0;
 }
 
-static void select_tracer(struct perf_ftrace *ftrace)
-{
-	bool graph = !list_empty(&ftrace->graph_funcs) ||
-		     !list_empty(&ftrace->nograph_funcs);
-	bool func = !list_empty(&ftrace->filters) ||
-		    !list_empty(&ftrace->notrace);
-
-	/* The function_graph has priority over function tracer. */
-	if (graph)
-		ftrace->tracer = "function_graph";
-	else if (func)
-		ftrace->tracer = "function";
-	/* Otherwise, the default tracer is used. */
-
-	pr_debug("%s tracer is used\n", ftrace->tracer);
-}
-
 int cmd_ftrace(int argc, const char **argv)
 {
 	int ret;
@@ -937,6 +929,11 @@ int cmd_ftrace(int argc, const char **argv)
 	INIT_LIST_HEAD(&ftrace.graph_funcs);
 	INIT_LIST_HEAD(&ftrace.nograph_funcs);
 
+	signal(SIGINT, sig_handler);
+	signal(SIGUSR1, sig_handler);
+	signal(SIGCHLD, sig_handler);
+	signal(SIGPIPE, sig_handler);
+
 	ret = perf_config(perf_ftrace_config, &ftrace);
 	if (ret < 0)
 		return -1;
@@ -951,8 +948,6 @@ int cmd_ftrace(int argc, const char **argv)
 	if (!argc && target__none(&ftrace.target))
 		ftrace.target.system_wide = true;
 
-	select_tracer(&ftrace);
-
 	ret = target__validate(&ftrace.target);
 	if (ret) {
 		char errbuf[512];
@@ -972,7 +967,15 @@ int cmd_ftrace(int argc, const char **argv)
 	if (ret < 0)
 		goto out_delete_evlist;
 
-	ret = __cmd_ftrace(&ftrace, argc, argv);
+	if (argc) {
+		ret = evlist__prepare_workload(ftrace.evlist, &ftrace.target,
+					       argv, false,
+					       ftrace__workload_exec_failed_signal);
+		if (ret < 0)
+			goto out_delete_evlist;
+	}
+
+	ret = __cmd_ftrace(&ftrace);
 
 out_delete_evlist:
 	evlist__delete(ftrace.evlist);
-- 
cgit v1.2.3


From 53be50282269b46c678ae5a9f54acf7416a10dbb Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 15 Dec 2021 10:51:52 -0800
Subject: perf ftrace: Add 'latency' subcommand

The perf ftrace latency is to get a histogram of function execution
time.  Users should give a function name using -T option.

This is implemented using function_graph tracer with the given
function only.  And it parses the output to extract the time.

  $ sudo perf ftrace latency -a -T mutex_lock sleep 1
  #   DURATION     |      COUNT | GRAPH                          |
       0 - 1    us |       4596 | ########################       |
       1 - 2    us |       1680 | #########                      |
       2 - 4    us |       1106 | #####                          |
       4 - 8    us |        546 | ##                             |
       8 - 16   us |        562 | ###                            |
      16 - 32   us |          1 |                                |
      32 - 64   us |          0 |                                |
      64 - 128  us |          0 |                                |
     128 - 256  us |          0 |                                |
     256 - 512  us |          0 |                                |
     512 - 1024 us |          0 |                                |
       1 - 2    ms |          0 |                                |
       2 - 4    ms |          0 |                                |
       4 - 8    ms |          0 |                                |
       8 - 16   ms |          0 |                                |
      16 - 32   ms |          0 |                                |
      32 - 64   ms |          0 |                                |
      64 - 128  ms |          0 |                                |
     128 - 256  ms |          0 |                                |
     256 - 512  ms |          0 |                                |
     512 - 1024 ms |          0 |                                |
       1 - ...   s |          0 |                                |

Committer testing:

Latency for the __handle_mm_fault kernel function, system wide for 1
second, see how one can go from the usual 'perf ftrace' output, now the
same as for the 'perf ftrace trace' subcommand, to the new 'perf ftrace
latency' subcommand:

  # perf ftrace -T __handle_mm_fault -a sleep 1 | wc -l
  709
  # perf ftrace -T __handle_mm_fault -a sleep 1 | wc -l
  510
  # perf ftrace -T __handle_mm_fault -a sleep 1 | head -20
  # tracer: function
  #
  # entries-in-buffer/entries-written: 0/0   #P:32
  #
  #           TASK-PID     CPU#     TIMESTAMP  FUNCTION
  #              | |         |         |         |
         perf-exec-1685104 [007]  90638.894613: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894620: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894622: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894635: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894688: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894702: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894714: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894728: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894740: __handle_mm_fault <-handle_mm_fault
         perf-exec-1685104 [007]  90638.894751: __handle_mm_fault <-handle_mm_fault
             sleep-1685104 [007]  90638.894962: __handle_mm_fault <-handle_mm_fault
             sleep-1685104 [007]  90638.894977: __handle_mm_fault <-handle_mm_fault
             sleep-1685104 [007]  90638.894983: __handle_mm_fault <-handle_mm_fault
             sleep-1685104 [007]  90638.894995: __handle_mm_fault <-handle_mm_fault
  # perf ftrace latency -T __handle_mm_fault -a sleep 1
  #   DURATION     |      COUNT | GRAPH                                          |
       0 - 1    us |        125 | ######                                         |
       1 - 2    us |        249 | #############                                  |
       2 - 4    us |        455 | ########################                       |
       4 - 8    us |         37 | #                                              |
       8 - 16   us |          0 |                                                |
      16 - 32   us |          0 |                                                |
      32 - 64   us |          0 |                                                |
      64 - 128  us |          0 |                                                |
     128 - 256  us |          0 |                                                |
     256 - 512  us |          0 |                                                |
     512 - 1024 us |          0 |                                                |
       1 - 2    ms |          0 |                                                |
       2 - 4    ms |          0 |                                                |
       4 - 8    ms |          0 |                                                |
       8 - 16   ms |          0 |                                                |
      16 - 32   ms |          0 |                                                |
      32 - 64   ms |          0 |                                                |
      64 - 128  ms |          0 |                                                |
     128 - 256  ms |          0 |                                                |
     256 - 512  ms |          0 |                                                |
     512 - 1024 ms |          0 |                                                |
       1 - ...   s |          0 |                                                |
  #

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Changbin Du <changbin.du@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211215185154.360314-4-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-ftrace.c | 285 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 276 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 0f8310bd0e6c..8fd3c9c44c69 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -13,7 +13,9 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <fcntl.h>
+#include <math.h>
 #include <poll.h>
+#include <ctype.h>
 #include <linux/capability.h>
 #include <linux/string.h>
 
@@ -702,6 +704,224 @@ out:
 	return (done && !workload_exec_errno) ? 0 : -1;
 }
 
+#define NUM_BUCKET  22  /* 20 + 2 (for outliers in both direction) */
+
+static void make_histogram(int buckets[], char *buf, size_t len, char *linebuf)
+{
+	char *p, *q;
+	char *unit;
+	double num;
+	int i;
+
+	/* ensure NUL termination */
+	buf[len] = '\0';
+
+	/* handle data line by line */
+	for (p = buf; (q = strchr(p, '\n')) != NULL; p = q + 1) {
+		*q = '\0';
+		/* move it to the line buffer */
+		strcat(linebuf, p);
+
+		/*
+		 * parse trace output to get function duration like in
+		 *
+		 * # tracer: function_graph
+		 * #
+		 * # CPU  DURATION                  FUNCTION CALLS
+		 * # |     |   |                     |   |   |   |
+		 *  1) + 10.291 us   |  do_filp_open();
+		 *  1)   4.889 us    |  do_filp_open();
+		 *  1)   6.086 us    |  do_filp_open();
+		 *
+		 */
+		if (linebuf[0] == '#')
+			goto next;
+
+		/* ignore CPU */
+		p = strchr(linebuf, ')');
+		if (p == NULL)
+			p = linebuf;
+
+		while (*p && !isdigit(*p) && (*p != '|'))
+			p++;
+
+		/* no duration */
+		if (*p == '\0' || *p == '|')
+			goto next;
+
+		num = strtod(p, &unit);
+		if (!unit || strncmp(unit, " us", 3))
+			goto next;
+
+		i = log2(num);
+		if (i < 0)
+			i = 0;
+		if (i >= NUM_BUCKET)
+			i = NUM_BUCKET - 1;
+
+		buckets[i]++;
+
+next:
+		/* empty the line buffer for the next output  */
+		linebuf[0] = '\0';
+	}
+
+	/* preserve any remaining output (before newline) */
+	strcat(linebuf, p);
+}
+
+static void display_histogram(int buckets[])
+{
+	int i;
+	int total = 0;
+	int bar_total = 46;  /* to fit in 80 column */
+	char bar[] = "###############################################";
+	int bar_len;
+
+	for (i = 0; i < NUM_BUCKET; i++)
+		total += buckets[i];
+
+	if (total == 0) {
+		printf("No data found\n");
+		return;
+	}
+
+	printf("# %14s | %10s | %-*s |\n",
+	       "  DURATION    ", "COUNT", bar_total, "GRAPH");
+
+	bar_len = buckets[0] * bar_total / total;
+	printf("  %4d - %-4d %s | %10d | %.*s%*s |\n",
+	       0, 1, "us", buckets[0], bar_len, bar, bar_total - bar_len, "");
+
+	for (i = 1; i < NUM_BUCKET - 1; i++) {
+		int start = (1 << (i - 1));
+		int stop = 1 << i;
+		const char *unit = "us";
+
+		if (start >= 1024) {
+			start >>= 10;
+			stop >>= 10;
+			unit = "ms";
+		}
+		bar_len = buckets[i] * bar_total / total;
+		printf("  %4d - %-4d %s | %10d | %.*s%*s |\n",
+		       start, stop, unit, buckets[i], bar_len, bar,
+		       bar_total - bar_len, "");
+	}
+
+	bar_len = buckets[NUM_BUCKET - 1] * bar_total / total;
+	printf("  %4d - %-4s %s | %10d | %.*s%*s |\n",
+	       1, "...", " s", buckets[NUM_BUCKET - 1], bar_len, bar,
+	       bar_total - bar_len, "");
+
+}
+
+static int __cmd_latency(struct perf_ftrace *ftrace)
+{
+	char *trace_file;
+	int trace_fd;
+	char buf[4096];
+	char line[256];
+	struct pollfd pollfd = {
+		.events = POLLIN,
+	};
+	int buckets[NUM_BUCKET] = { };
+
+	if (!(perf_cap__capable(CAP_PERFMON) ||
+	      perf_cap__capable(CAP_SYS_ADMIN))) {
+		pr_err("ftrace only works for %s!\n",
+#ifdef HAVE_LIBCAP_SUPPORT
+		"users with the CAP_PERFMON or CAP_SYS_ADMIN capability"
+#else
+		"root"
+#endif
+		);
+		return -1;
+	}
+
+	if (reset_tracing_files(ftrace) < 0) {
+		pr_err("failed to reset ftrace\n");
+		goto out;
+	}
+
+	/* reset ftrace buffer */
+	if (write_tracing_file("trace", "0") < 0)
+		goto out;
+
+	if (set_tracing_options(ftrace) < 0)
+		goto out_reset;
+
+	/* force to use the function_graph tracer to track duration */
+	if (write_tracing_file("current_tracer", "function_graph") < 0) {
+		pr_err("failed to set current_tracer to function_graph\n");
+		goto out_reset;
+	}
+
+	trace_file = get_tracing_file("trace_pipe");
+	if (!trace_file) {
+		pr_err("failed to open trace_pipe\n");
+		goto out_reset;
+	}
+
+	trace_fd = open(trace_file, O_RDONLY);
+
+	put_tracing_file(trace_file);
+
+	if (trace_fd < 0) {
+		pr_err("failed to open trace_pipe\n");
+		goto out_reset;
+	}
+
+	fcntl(trace_fd, F_SETFL, O_NONBLOCK);
+	pollfd.fd = trace_fd;
+
+	if (write_tracing_file("tracing_on", "1") < 0) {
+		pr_err("can't enable tracing\n");
+		goto out_close_fd;
+	}
+
+	evlist__start_workload(ftrace->evlist);
+
+	line[0] = '\0';
+	while (!done) {
+		if (poll(&pollfd, 1, -1) < 0)
+			break;
+
+		if (pollfd.revents & POLLIN) {
+			int n = read(trace_fd, buf, sizeof(buf) - 1);
+			if (n < 0)
+				break;
+
+			make_histogram(buckets, buf, n, line);
+		}
+	}
+
+	write_tracing_file("tracing_on", "0");
+
+	if (workload_exec_errno) {
+		const char *emsg = str_error_r(workload_exec_errno, buf, sizeof(buf));
+		pr_err("workload failed: %s\n", emsg);
+		goto out_close_fd;
+	}
+
+	/* read remaining buffer contents */
+	while (true) {
+		int n = read(trace_fd, buf, sizeof(buf) - 1);
+		if (n <= 0)
+			break;
+		make_histogram(buckets, buf, n, line);
+	}
+
+	display_histogram(buckets);
+
+out_close_fd:
+	close(trace_fd);
+out_reset:
+	reset_tracing_files(ftrace);
+out:
+	return (done && !workload_exec_errno) ? 0 : -1;
+}
+
 static int perf_ftrace_config(const char *var, const char *value, void *cb)
 {
 	struct perf_ftrace *ftrace = cb;
@@ -864,6 +1084,12 @@ static int parse_graph_tracer_opts(const struct option *opt,
 	return 0;
 }
 
+enum perf_ftrace_subcommand {
+	PERF_FTRACE_NONE,
+	PERF_FTRACE_TRACE,
+	PERF_FTRACE_LATENCY,
+};
+
 int cmd_ftrace(int argc, const char **argv)
 {
 	int ret;
@@ -915,14 +1141,21 @@ int cmd_ftrace(int argc, const char **argv)
 		     "Number of milliseconds to wait before starting tracing after program start"),
 	OPT_PARENT(common_options),
 	};
+	const struct option latency_options[] = {
+	OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
+		     "Show latency of given function", parse_filter_func),
+	OPT_PARENT(common_options),
+	};
+	const struct option *options = ftrace_options;
 
 	const char * const ftrace_usage[] = {
 		"perf ftrace [<options>] [<command>]",
 		"perf ftrace [<options>] -- [<command>] [<options>]",
-		"perf ftrace trace [<options>] [<command>]",
-		"perf ftrace trace [<options>] -- [<command>] [<options>]",
+		"perf ftrace {trace|latency} [<options>] [<command>]",
+		"perf ftrace {trace|latency} [<options>] -- [<command>] [<options>]",
 		NULL
 	};
+	enum perf_ftrace_subcommand subcmd = PERF_FTRACE_NONE;
 
 	INIT_LIST_HEAD(&ftrace.filters);
 	INIT_LIST_HEAD(&ftrace.notrace);
@@ -938,15 +1171,29 @@ int cmd_ftrace(int argc, const char **argv)
 	if (ret < 0)
 		return -1;
 
-	if (argc > 1 && !strcmp(argv[1], "trace")) {
-		argc--;
-		argv++;
+	if (argc > 1) {
+		if (!strcmp(argv[1], "trace")) {
+			subcmd = PERF_FTRACE_TRACE;
+		} else if (!strcmp(argv[1], "latency")) {
+			subcmd = PERF_FTRACE_LATENCY;
+			options = latency_options;
+		}
+
+		if (subcmd != PERF_FTRACE_NONE) {
+			argc--;
+			argv++;
+		}
 	}
+	/* for backward compatibility */
+	if (subcmd == PERF_FTRACE_NONE)
+		subcmd = PERF_FTRACE_TRACE;
 
-	argc = parse_options(argc, argv, ftrace_options, ftrace_usage,
+	argc = parse_options(argc, argv, options, ftrace_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
-	if (!argc && target__none(&ftrace.target))
-		ftrace.target.system_wide = true;
+	if (argc < 0) {
+		ret = -EINVAL;
+		goto out_delete_filters;
+	}
 
 	ret = target__validate(&ftrace.target);
 	if (ret) {
@@ -975,7 +1222,27 @@ int cmd_ftrace(int argc, const char **argv)
 			goto out_delete_evlist;
 	}
 
-	ret = __cmd_ftrace(&ftrace);
+	switch (subcmd) {
+	case PERF_FTRACE_TRACE:
+		if (!argc && target__none(&ftrace.target))
+			ftrace.target.system_wide = true;
+		ret = __cmd_ftrace(&ftrace);
+		break;
+	case PERF_FTRACE_LATENCY:
+		if (list_empty(&ftrace.filters)) {
+			pr_err("Should provide a function to measure\n");
+			parse_options_usage(ftrace_usage, options, "T", 1);
+			ret = -EINVAL;
+			goto out_delete_evlist;
+		}
+		ret = __cmd_latency(&ftrace);
+		break;
+	case PERF_FTRACE_NONE:
+	default:
+		pr_err("Invalid subcommand\n");
+		ret = -EINVAL;
+		break;
+	}
 
 out_delete_evlist:
 	evlist__delete(ftrace.evlist);
-- 
cgit v1.2.3


From 177f4eac7fb7fe5c70fef30dd6c4ef8f81cf7776 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 15 Dec 2021 10:51:53 -0800
Subject: perf ftrace: Add -b/--use-bpf option for latency subcommand

The -b/--use-bpf option is to use BPF to get latency info of kernel
functions.  It'd have better performance impact and I observed that
latency of same function is smaller than before when using BPF.

Committer testing:

  # strace -e bpf perf ftrace latency -b -T __handle_mm_fault -a sleep 1
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7fff51914e00, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\20\0\0\0\20\0\0\0\5\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=45, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\08\0\0\08\0\0\0\t\0\0\0\0\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=89, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\f\0\0\0\f\0\0\0\7\0\0\0\1\0\0\0\0\0\0\20"..., btf_log_buf=NULL, btf_size=43, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\5\0\0\0\0\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=77, btf_log_size=0, btf_log_level=0}, 128) = -1 EINVAL (Invalid argument)
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\350\2\0\0\350\2\0\0\353\2\0\0\0\0\0\0\0\0\0\2"..., btf_log_buf=NULL, btf_size=1515, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=32, max_entries=1, map_flags=0, inner_map_fd=0, map_name="", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=5, insns=0x7fff51914c30, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 5
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=4, max_entries=1, map_flags=BPF_F_MMAPABLE, inner_map_fd=0, map_name="", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7fff51914a80, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="test", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 4
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=8, value_size=8, max_entries=10000, map_flags=0, inner_map_fd=0, map_name="functime", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=1, max_entries=1, map_flags=0, inner_map_fd=0, map_name="cpu_filter", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 5
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=1, max_entries=1, map_flags=0, inner_map_fd=0, map_name="task_filter", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 7
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=8, max_entries=22, map_flags=0, inner_map_fd=0, map_name="latency", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 8
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=4, max_entries=1, map_flags=BPF_F_MMAPABLE, inner_map_fd=0, map_name="func_lat.bss", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=30, btf_vmlinux_value_type_id=0}, 128) = 9
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=9, key=0x7fff51914c40, value=0x7f6e99be2000, flags=BPF_ANY}, 128) = 0
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_KPROBE, insn_cnt=18, insns=0x11e4160, license="", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(5, 14, 16), prog_flags=0, prog_name="func_begin", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=3, func_info_rec_size=8, func_info=0x11dfc50, func_info_cnt=1, line_info_rec_size=16, line_info=0x11e04c0, line_info_cnt=9, attach_btf_id=0, attach_prog_fd=0}, 128) = 10
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_KPROBE, insn_cnt=99, insns=0x11ded70, license="", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(5, 14, 16), prog_flags=0, prog_name="func_end", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=3, func_info_rec_size=8, func_info=0x11dfc70, func_info_cnt=1, line_info_rec_size=16, line_info=0x11f6e10, line_info_cnt=20, attach_btf_id=0, attach_prog_fd=0}, 128) = 11
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x7fff51914a80, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 13
  bpf(BPF_LINK_CREATE, {link_create={prog_fd=13, target_fd=-1, attach_type=0x29 /* BPF_??? */, flags=0}}, 128) = -1 EINVAL (Invalid argument)
  --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1699992, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=8, key=0x7fff51914f84, value=0x11f6fa0, flags=BPF_ANY}, 128) = 0
  #   DURATION     |      COUNT | GRAPH                                          |
       0 - 1    us |         52 | ###################                            |
       1 - 2    us |         36 | #############                                  |
       2 - 4    us |         24 | #########                                      |
       4 - 8    us |          7 | ##                                             |
       8 - 16   us |          1 |                                                |
      16 - 32   us |          0 |                                                |
      32 - 64   us |          0 |                                                |
      64 - 128  us |          0 |                                                |
     128 - 256  us |          0 |                                                |
     256 - 512  us |          0 |                                                |
     512 - 1024 us |          0 |                                                |
       1 - 2    ms |          0 |                                                |
       2 - 4    ms |          0 |                                                |
       4 - 8    ms |          0 |                                                |
       8 - 16   ms |          0 |                                                |
      16 - 32   ms |          0 |                                                |
      32 - 64   ms |          0 |                                                |
      64 - 128  ms |          0 |                                                |
     128 - 256  ms |          0 |                                                |
     256 - 512  ms |          0 |                                                |
     512 - 1024 ms |          0 |                                                |
       1 - ...   s |          0 |                                                |
  +++ exited with 0 +++
  #

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Changbin Du <changbin.du@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211215185154.360314-5-namhyung@kernel.org
[ Add missing util/cpumap.h include and removed unused 'fd' variable ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.perf                    |   2 +-
 tools/perf/builtin-ftrace.c                 | 158 ++++++++++++++++------------
 tools/perf/util/Build                       |   1 +
 tools/perf/util/bpf_ftrace.c                | 112 ++++++++++++++++++++
 tools/perf/util/bpf_skel/func_latency.bpf.c |  93 ++++++++++++++++
 tools/perf/util/ftrace.h                    |  81 ++++++++++++++
 6 files changed, 380 insertions(+), 67 deletions(-)
 create mode 100644 tools/perf/util/bpf_ftrace.c
 create mode 100644 tools/perf/util/bpf_skel/func_latency.bpf.c
 create mode 100644 tools/perf/util/ftrace.h

(limited to 'tools')

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 164a37523781..ac861e42c8f7 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1041,7 +1041,7 @@ SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
-SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h
+SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
 
 $(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
 	$(Q)$(MKDIR) -p $@
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 8fd3c9c44c69..2b54e2ddc80a 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -30,36 +30,12 @@
 #include "strfilter.h"
 #include "util/cap.h"
 #include "util/config.h"
+#include "util/ftrace.h"
 #include "util/units.h"
 #include "util/parse-sublevel-options.h"
 
 #define DEFAULT_TRACER  "function_graph"
 
-struct perf_ftrace {
-	struct evlist		*evlist;
-	struct target		target;
-	const char		*tracer;
-	struct list_head	filters;
-	struct list_head	notrace;
-	struct list_head	graph_funcs;
-	struct list_head	nograph_funcs;
-	int			graph_depth;
-	unsigned long		percpu_buffer_size;
-	bool			inherit;
-	int			func_stack_trace;
-	int			func_irq_info;
-	int			graph_nosleep_time;
-	int			graph_noirqs;
-	int			graph_verbose;
-	int			graph_thresh;
-	unsigned int		initial_delay;
-};
-
-struct filter_entry {
-	struct list_head	list;
-	char			name[];
-};
-
 static volatile int workload_exec_errno;
 static bool done;
 
@@ -704,8 +680,6 @@ out:
 	return (done && !workload_exec_errno) ? 0 : -1;
 }
 
-#define NUM_BUCKET  22  /* 20 + 2 (for outliers in both direction) */
-
 static void make_histogram(int buckets[], char *buf, size_t len, char *linebuf)
 {
 	char *p, *q;
@@ -816,69 +790,116 @@ static void display_histogram(int buckets[])
 
 }
 
-static int __cmd_latency(struct perf_ftrace *ftrace)
+static int prepare_func_latency(struct perf_ftrace *ftrace)
 {
 	char *trace_file;
-	int trace_fd;
-	char buf[4096];
-	char line[256];
-	struct pollfd pollfd = {
-		.events = POLLIN,
-	};
-	int buckets[NUM_BUCKET] = { };
+	int fd;
 
-	if (!(perf_cap__capable(CAP_PERFMON) ||
-	      perf_cap__capable(CAP_SYS_ADMIN))) {
-		pr_err("ftrace only works for %s!\n",
-#ifdef HAVE_LIBCAP_SUPPORT
-		"users with the CAP_PERFMON or CAP_SYS_ADMIN capability"
-#else
-		"root"
-#endif
-		);
-		return -1;
-	}
+	if (ftrace->target.use_bpf)
+		return perf_ftrace__latency_prepare_bpf(ftrace);
 
 	if (reset_tracing_files(ftrace) < 0) {
 		pr_err("failed to reset ftrace\n");
-		goto out;
+		return -1;
 	}
 
 	/* reset ftrace buffer */
 	if (write_tracing_file("trace", "0") < 0)
-		goto out;
+		return -1;
 
 	if (set_tracing_options(ftrace) < 0)
-		goto out_reset;
+		return -1;
 
 	/* force to use the function_graph tracer to track duration */
 	if (write_tracing_file("current_tracer", "function_graph") < 0) {
 		pr_err("failed to set current_tracer to function_graph\n");
-		goto out_reset;
+		return -1;
 	}
 
 	trace_file = get_tracing_file("trace_pipe");
 	if (!trace_file) {
 		pr_err("failed to open trace_pipe\n");
-		goto out_reset;
+		return -1;
 	}
 
-	trace_fd = open(trace_file, O_RDONLY);
+	fd = open(trace_file, O_RDONLY);
+	if (fd < 0)
+		pr_err("failed to open trace_pipe\n");
 
 	put_tracing_file(trace_file);
+	return fd;
+}
 
-	if (trace_fd < 0) {
-		pr_err("failed to open trace_pipe\n");
-		goto out_reset;
+static int start_func_latency(struct perf_ftrace *ftrace)
+{
+	if (ftrace->target.use_bpf)
+		return perf_ftrace__latency_start_bpf(ftrace);
+
+	if (write_tracing_file("tracing_on", "1") < 0) {
+		pr_err("can't enable tracing\n");
+		return -1;
 	}
 
+	return 0;
+}
+
+static int stop_func_latency(struct perf_ftrace *ftrace)
+{
+	if (ftrace->target.use_bpf)
+		return perf_ftrace__latency_stop_bpf(ftrace);
+
+	write_tracing_file("tracing_on", "0");
+	return 0;
+}
+
+static int read_func_latency(struct perf_ftrace *ftrace, int buckets[])
+{
+	if (ftrace->target.use_bpf)
+		return perf_ftrace__latency_read_bpf(ftrace, buckets);
+
+	return 0;
+}
+
+static int cleanup_func_latency(struct perf_ftrace *ftrace)
+{
+	if (ftrace->target.use_bpf)
+		return perf_ftrace__latency_cleanup_bpf(ftrace);
+
+	reset_tracing_files(ftrace);
+	return 0;
+}
+
+static int __cmd_latency(struct perf_ftrace *ftrace)
+{
+	int trace_fd;
+	char buf[4096];
+	char line[256];
+	struct pollfd pollfd = {
+		.events = POLLIN,
+	};
+	int buckets[NUM_BUCKET] = { };
+
+	if (!(perf_cap__capable(CAP_PERFMON) ||
+	      perf_cap__capable(CAP_SYS_ADMIN))) {
+		pr_err("ftrace only works for %s!\n",
+#ifdef HAVE_LIBCAP_SUPPORT
+		"users with the CAP_PERFMON or CAP_SYS_ADMIN capability"
+#else
+		"root"
+#endif
+		);
+		return -1;
+	}
+
+	trace_fd = prepare_func_latency(ftrace);
+	if (trace_fd < 0)
+		goto out;
+
 	fcntl(trace_fd, F_SETFL, O_NONBLOCK);
 	pollfd.fd = trace_fd;
 
-	if (write_tracing_file("tracing_on", "1") < 0) {
-		pr_err("can't enable tracing\n");
-		goto out_close_fd;
-	}
+	if (start_func_latency(ftrace) < 0)
+		goto out;
 
 	evlist__start_workload(ftrace->evlist);
 
@@ -896,29 +917,30 @@ static int __cmd_latency(struct perf_ftrace *ftrace)
 		}
 	}
 
-	write_tracing_file("tracing_on", "0");
+	stop_func_latency(ftrace);
 
 	if (workload_exec_errno) {
 		const char *emsg = str_error_r(workload_exec_errno, buf, sizeof(buf));
 		pr_err("workload failed: %s\n", emsg);
-		goto out_close_fd;
+		goto out;
 	}
 
 	/* read remaining buffer contents */
-	while (true) {
+	while (!ftrace->target.use_bpf) {
 		int n = read(trace_fd, buf, sizeof(buf) - 1);
 		if (n <= 0)
 			break;
 		make_histogram(buckets, buf, n, line);
 	}
 
+	read_func_latency(ftrace, buckets);
+
 	display_histogram(buckets);
 
-out_close_fd:
-	close(trace_fd);
-out_reset:
-	reset_tracing_files(ftrace);
 out:
+	close(trace_fd);
+	cleanup_func_latency(ftrace);
+
 	return (done && !workload_exec_errno) ? 0 : -1;
 }
 
@@ -1144,6 +1166,10 @@ int cmd_ftrace(int argc, const char **argv)
 	const struct option latency_options[] = {
 	OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func",
 		     "Show latency of given function", parse_filter_func),
+#ifdef HAVE_BPF_SKEL
+	OPT_BOOLEAN('b', "use-bpf", &ftrace.target.use_bpf,
+		    "Use BPF to measure function latency"),
+#endif
 	OPT_PARENT(common_options),
 	};
 	const struct option *options = ftrace_options;
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 2e5bfbb69960..294b12430d73 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -144,6 +144,7 @@ perf-$(CONFIG_LIBBPF) += bpf-loader.o
 perf-$(CONFIG_LIBBPF) += bpf_map.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
+perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
 perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
 perf-$(CONFIG_LIBELF) += symbol-elf.o
 perf-$(CONFIG_LIBELF) += probe-file.o
diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c
new file mode 100644
index 000000000000..ec4e2f5a2fc4
--- /dev/null
+++ b/tools/perf/util/bpf_ftrace.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <linux/err.h>
+
+#include "util/ftrace.h"
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/bpf_counter.h"
+
+#include "util/bpf_skel/func_latency.skel.h"
+
+static struct func_latency_bpf *skel;
+
+int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
+{
+	int err;
+	struct filter_entry *func;
+
+	if (!list_is_singular(&ftrace->filters)) {
+		pr_err("ERROR: %s target function(s).\n",
+		       list_empty(&ftrace->filters) ? "No" : "Too many");
+		return -1;
+	}
+
+	func = list_first_entry(&ftrace->filters, struct filter_entry, list);
+
+	skel = func_latency_bpf__open();
+	if (!skel) {
+		pr_err("Failed to open func latency skeleton\n");
+		return -1;
+	}
+
+	set_max_rlimit();
+
+	err = func_latency_bpf__load(skel);
+	if (err) {
+		pr_err("Failed to load func latency skeleton\n");
+		goto out;
+	}
+
+	skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin,
+							    false, func->name);
+	if (IS_ERR(skel->links.func_begin)) {
+		pr_err("Failed to attach fentry program\n");
+		err = PTR_ERR(skel->links.func_begin);
+		goto out;
+	}
+
+	skel->links.func_end = bpf_program__attach_kprobe(skel->progs.func_end,
+							  true, func->name);
+	if (IS_ERR(skel->links.func_end)) {
+		pr_err("Failed to attach fexit program\n");
+		err = PTR_ERR(skel->links.func_end);
+		goto out;
+	}
+
+	/* XXX: we don't actually use this fd - just for poll() */
+	return open("/dev/null", O_RDONLY);
+
+out:
+	return err;
+}
+
+int perf_ftrace__latency_start_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	skel->bss->enabled = 1;
+	return 0;
+}
+
+int perf_ftrace__latency_stop_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	skel->bss->enabled = 0;
+	return 0;
+}
+
+int perf_ftrace__latency_read_bpf(struct perf_ftrace *ftrace __maybe_unused,
+				  int buckets[])
+{
+	int i, fd, err;
+	u32 idx;
+	u64 *hist;
+	int ncpus = cpu__max_cpu();
+
+	fd = bpf_map__fd(skel->maps.latency);
+
+	hist = calloc(ncpus, sizeof(*hist));
+	if (hist == NULL)
+		return -ENOMEM;
+
+	for (idx = 0; idx < NUM_BUCKET; idx++) {
+		err = bpf_map_lookup_elem(fd, &idx, hist);
+		if (err) {
+			buckets[idx] = 0;
+			continue;
+		}
+
+		for (i = 0; i < ncpus; i++)
+			buckets[idx] += hist[i];
+	}
+
+	free(hist);
+	return 0;
+}
+
+int perf_ftrace__latency_cleanup_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	func_latency_bpf__destroy(skel);
+	return 0;
+}
diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c
new file mode 100644
index 000000000000..ccd96b09fc42
--- /dev/null
+++ b/tools/perf/util/bpf_skel/func_latency.bpf.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+// This should be in sync with "util/ftrace.h"
+#define NUM_BUCKET  22
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 10000);
+} functime SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} cpu_filter SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u8));
+	__uint(max_entries, 1);
+} task_filter SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, NUM_BUCKET);
+} latency SEC(".maps");
+
+
+int enabled = 0;
+
+SEC("kprobe/func")
+int BPF_PROG(func_begin)
+{
+	__u64 key, now;
+
+	if (!enabled)
+		return 0;
+
+	key = bpf_get_current_pid_tgid();
+	now = bpf_ktime_get_ns();
+
+	// overwrite timestamp for nested functions
+	bpf_map_update_elem(&functime, &key, &now, BPF_ANY);
+	return 0;
+}
+
+SEC("kretprobe/func")
+int BPF_PROG(func_end)
+{
+	__u64 tid;
+	__u64 *start;
+
+	if (!enabled)
+		return 0;
+
+	tid = bpf_get_current_pid_tgid();
+
+	start = bpf_map_lookup_elem(&functime, &tid);
+	if (start) {
+		__s64 delta = bpf_ktime_get_ns() - *start;
+		__u32 key;
+		__u64 *hist;
+
+		bpf_map_delete_elem(&functime, &tid);
+
+		if (delta < 0)
+			return 0;
+
+		// calculate index using delta in usec
+		for (key = 0; key < (NUM_BUCKET - 1); key++) {
+			if (delta < ((1000UL) << key))
+				break;
+		}
+
+		hist = bpf_map_lookup_elem(&latency, &key);
+		if (!hist)
+			return 0;
+
+		*hist += 1;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/ftrace.h b/tools/perf/util/ftrace.h
new file mode 100644
index 000000000000..887f68a185f7
--- /dev/null
+++ b/tools/perf/util/ftrace.h
@@ -0,0 +1,81 @@
+#ifndef __PERF_FTRACE_H__
+#define __PERF_FTRACE_H__
+
+#include <linux/list.h>
+
+#include "target.h"
+
+struct evlist;
+
+struct perf_ftrace {
+	struct evlist		*evlist;
+	struct target		target;
+	const char		*tracer;
+	struct list_head	filters;
+	struct list_head	notrace;
+	struct list_head	graph_funcs;
+	struct list_head	nograph_funcs;
+	unsigned long		percpu_buffer_size;
+	bool			inherit;
+	int			graph_depth;
+	int			func_stack_trace;
+	int			func_irq_info;
+	int			graph_nosleep_time;
+	int			graph_noirqs;
+	int			graph_verbose;
+	int			graph_thresh;
+	unsigned int		initial_delay;
+};
+
+struct filter_entry {
+	struct list_head	list;
+	char			name[];
+};
+
+#define NUM_BUCKET  22  /* 20 + 2 (for outliers in both direction) */
+
+#ifdef HAVE_BPF_SKEL
+
+int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace);
+int perf_ftrace__latency_start_bpf(struct perf_ftrace *ftrace);
+int perf_ftrace__latency_stop_bpf(struct perf_ftrace *ftrace);
+int perf_ftrace__latency_read_bpf(struct perf_ftrace *ftrace,
+				  int buckets[]);
+int perf_ftrace__latency_cleanup_bpf(struct perf_ftrace *ftrace);
+
+#else  /* !HAVE_BPF_SKEL */
+
+static inline int
+perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	return -1;
+}
+
+static inline int
+perf_ftrace__latency_start_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	return -1;
+}
+
+static inline int
+perf_ftrace__latency_stop_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	return -1;
+}
+
+static inline int
+perf_ftrace__latency_read_bpf(struct perf_ftrace *ftrace __maybe_unused,
+			      int buckets[] __maybe_unused)
+{
+	return -1;
+}
+
+static inline int
+perf_ftrace__latency_cleanup_bpf(struct perf_ftrace *ftrace __maybe_unused)
+{
+	return -1;
+}
+
+#endif  /* HAVE_BPF_SKEL */
+
+#endif  /* __PERF_FTRACE_H__ */
-- 
cgit v1.2.3


From 9c5c605219578b8167b649245e00d6407f2c18da Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 15 Dec 2021 10:51:54 -0800
Subject: perf ftrace: Implement cpu and task filters in BPF

Honor cpu and task options to set up filters (by pid or tid) in the
BPF program.  For example, the following command will show latency of
the mutex_lock for process 2570.

  # perf ftrace latency -b -T mutex_lock -p 2570 sleep 3
  #   DURATION     |      COUNT | GRAPH                          |
       0 - 1    us |        675 | ############################## |
       1 - 2    us |          9 |                                |
       2 - 4    us |          0 |                                |
       4 - 8    us |          0 |                                |
       8 - 16   us |          0 |                                |
      16 - 32   us |          0 |                                |
      32 - 64   us |          0 |                                |
      64 - 128  us |          0 |                                |
     128 - 256  us |          0 |                                |
     256 - 512  us |          0 |                                |
     512 - 1024 us |          0 |                                |
       1 - 2    ms |          0 |                                |
       2 - 4    ms |          0 |                                |
       4 - 8    ms |          0 |                                |
       8 - 16   ms |          0 |                                |
      16 - 32   ms |          0 |                                |
      32 - 64   ms |          0 |                                |
      64 - 128  ms |          0 |                                |
     128 - 256  ms |          0 |                                |
     256 - 512  ms |          0 |                                |
     512 - 1024 ms |          0 |                                |
       1 - ...   s |          0 |                                |

Committer testing:

Looking at faults on a firefox process:

  # strace -e bpf perf ftrace latency -b -p 1674378 -T __handle_mm_fault
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffee1fee740, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\20\0\0\0\20\0\0\0\5\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=45, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\08\0\0\08\0\0\0\t\0\0\0\0\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=89, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\f\0\0\0\f\0\0\0\7\0\0\0\1\0\0\0\0\0\0\20"..., btf_log_buf=NULL, btf_size=43, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\5\0\0\0\0\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=77, btf_log_size=0, btf_log_level=0}, 128) = -1 EINVAL (Invalid argument)
  bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0 \3\0\0 \3\0\0\306\3\0\0\0\0\0\0\0\0\0\2"..., btf_log_buf=NULL, btf_size=1790, btf_log_size=0, btf_log_level=0}, 128) = 3
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=32, max_entries=1, map_flags=0, inner_map_fd=0, map_name="", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=5, insns=0x7ffee1fee570, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 5
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=4, max_entries=1, map_flags=BPF_F_MMAPABLE, inner_map_fd=0, map_name="", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffee1fee3c0, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="test", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 4
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=8, value_size=8, max_entries=10000, map_flags=0, inner_map_fd=0, map_name="functime", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 4
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=1, max_entries=1, map_flags=0, inner_map_fd=0, map_name="cpu_filter", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 5
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=1, max_entries=36, map_flags=0, inner_map_fd=0, map_name="task_filter", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 6
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=8, max_entries=22, map_flags=0, inner_map_fd=0, map_name="latency", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 128) = 7
  bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=12, max_entries=1, map_flags=BPF_F_MMAPABLE, inner_map_fd=0, map_name="func_lat.bss", map_ifindex=0, btf_fd=3, btf_key_type_id=0, btf_value_type_id=32, btf_vmlinux_value_type_id=0}, 128) = 8
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=8, key=0x7ffee1fee580, value=0x7f01d940a000, flags=BPF_ANY}, 128) = 0
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_KPROBE, insn_cnt=42, insns=0x1871f30, license="", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(5, 14, 16), prog_flags=0, prog_name="func_begin", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=3, func_info_rec_size=8, func_info=0x18746a0, func_info_cnt=1, line_info_rec_size=16, line_info=0x1874550, line_info_cnt=20, attach_btf_id=0, attach_prog_fd=0}, 128) = 9
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_KPROBE, insn_cnt=99, insns=0x18769b0, license="", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(5, 14, 16), prog_flags=0, prog_name="func_end", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=3, func_info_rec_size=8, func_info=0x188a640, func_info_cnt=1, line_info_rec_size=16, line_info=0x188a660, line_info_cnt=20, attach_btf_id=0, attach_prog_fd=0}, 128) = 10
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_MAP_UPDATE_ELEM, {map_fd=6, key=0x7ffee1fee8e0, value=0x7ffee1fee8df, flags=BPF_ANY}, 128) = 0
  bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x7ffee1fee3c0, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 128) = 12
  bpf(BPF_LINK_CREATE, {link_create={prog_fd=12, target_fd=-1, attach_type=0x29 /* BPF_??? */, flags=0}}, 128) = -1 EINVAL (Invalid argument)
  ^Cstrace: Process 1702285 detached
  #   DURATION     |      COUNT | GRAPH                                          |
       0 - 1    us |        109 | #################                              |
       1 - 2    us |        127 | ###################                            |
       2 - 4    us |         36 | #####                                          |
       4 - 8    us |         20 | ###                                            |
       8 - 16   us |          2 |                                                |
      16 - 32   us |          0 |                                                |
      32 - 64   us |          0 |                                                |
      64 - 128  us |          0 |                                                |
     128 - 256  us |          0 |                                                |
     256 - 512  us |          0 |                                                |
     512 - 1024 us |          0 |                                                |
       1 - 2    ms |          0 |                                                |
       2 - 4    ms |          0 |                                                |
       4 - 8    ms |          0 |                                                |
       8 - 16   ms |          0 |                                                |
      16 - 32   ms |          0 |                                                |
      32 - 64   ms |          0 |                                                |
      64 - 128  ms |          0 |                                                |
     128 - 256  ms |          0 |                                                |
     256 - 512  ms |          0 |                                                |
     512 - 1024 ms |          0 |                                                |
       1 - ...   s |          0 |                                                |

  #

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Changbin Du <changbin.du@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211215185154.360314-6-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_ftrace.c                | 42 ++++++++++++++++++++++++++++-
 tools/perf/util/bpf_skel/func_latency.bpf.c | 21 +++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c
index ec4e2f5a2fc4..f00a2de6778c 100644
--- a/tools/perf/util/bpf_ftrace.c
+++ b/tools/perf/util/bpf_ftrace.c
@@ -7,7 +7,9 @@
 
 #include "util/ftrace.h"
 #include "util/cpumap.h"
+#include "util/thread_map.h"
 #include "util/debug.h"
+#include "util/evlist.h"
 #include "util/bpf_counter.h"
 
 #include "util/bpf_skel/func_latency.skel.h"
@@ -16,7 +18,8 @@ static struct func_latency_bpf *skel;
 
 int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
 {
-	int err;
+	int fd, err;
+	int i, ncpus = 1, ntasks = 1;
 	struct filter_entry *func;
 
 	if (!list_is_singular(&ftrace->filters)) {
@@ -33,6 +36,17 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
 		return -1;
 	}
 
+	/* don't need to set cpu filter for system-wide mode */
+	if (ftrace->target.cpu_list) {
+		ncpus = perf_cpu_map__nr(ftrace->evlist->core.cpus);
+		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
+	}
+
+	if (target__has_task(&ftrace->target) || target__none(&ftrace->target)) {
+		ntasks = perf_thread_map__nr(ftrace->evlist->core.threads);
+		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
+	}
+
 	set_max_rlimit();
 
 	err = func_latency_bpf__load(skel);
@@ -41,6 +55,32 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
 		goto out;
 	}
 
+	if (ftrace->target.cpu_list) {
+		u32 cpu;
+		u8 val = 1;
+
+		skel->bss->has_cpu = 1;
+		fd = bpf_map__fd(skel->maps.cpu_filter);
+
+		for (i = 0; i < ncpus; i++) {
+			cpu = cpu_map__cpu(ftrace->evlist->core.cpus, i);
+			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
+		}
+	}
+
+	if (target__has_task(&ftrace->target) || target__none(&ftrace->target)) {
+		u32 pid;
+		u8 val = 1;
+
+		skel->bss->has_task = 1;
+		fd = bpf_map__fd(skel->maps.task_filter);
+
+		for (i = 0; i < ntasks; i++) {
+			pid = perf_thread_map__pid(ftrace->evlist->core.threads, i);
+			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
+		}
+	}
+
 	skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin,
 							    false, func->name);
 	if (IS_ERR(skel->links.func_begin)) {
diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c
index ccd96b09fc42..ea94187fe443 100644
--- a/tools/perf/util/bpf_skel/func_latency.bpf.c
+++ b/tools/perf/util/bpf_skel/func_latency.bpf.c
@@ -37,6 +37,8 @@ struct {
 
 
 int enabled = 0;
+int has_cpu = 0;
+int has_task = 0;
 
 SEC("kprobe/func")
 int BPF_PROG(func_begin)
@@ -47,6 +49,25 @@ int BPF_PROG(func_begin)
 		return 0;
 
 	key = bpf_get_current_pid_tgid();
+
+	if (has_cpu) {
+		__u32 cpu = bpf_get_smp_processor_id();
+		__u8 *ok;
+
+		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+		if (!ok)
+			return 0;
+	}
+
+	if (has_task) {
+		__u32 pid = key & 0xffffffff;
+		__u8 *ok;
+
+		ok = bpf_map_lookup_elem(&task_filter, &pid);
+		if (!ok)
+			return 0;
+	}
+
 	now = bpf_ktime_get_ns();
 
 	// overwrite timestamp for nested functions
-- 
cgit v1.2.3


From b1a7288dedc6caf9023f2676b4f5ed34cf0d4029 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Dec 2021 23:48:54 +0000
Subject: bpf, selftests: Add test case trying to taint map value pointer

Add a test case which tries to taint map value pointer arithmetic into a
unknown scalar with subsequent export through the map.

Before fix:

  # ./test_verifier 1186
  #1186/u map access: trying to leak tained dst reg FAIL
  Unexpected success to load!
  verification time 24 usec
  stack depth 8
  processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
  #1186/p map access: trying to leak tained dst reg FAIL
  Unexpected success to load!
  verification time 8 usec
  stack depth 8
  processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
  Summary: 0 PASSED, 0 SKIPPED, 2 FAILED

After fix:

  # ./test_verifier 1186
  #1186/u map access: trying to leak tained dst reg OK
  #1186/p map access: trying to leak tained dst reg OK
  Summary: 2 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/value_ptr_arith.c       | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
index 2debba4e8a3a..4d347bc53aa2 100644
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -1077,6 +1077,29 @@
 	.errstr = "R0 invalid mem access 'inv'",
 	.errstr_unpriv = "R0 pointer -= pointer prohibited",
 },
+{
+	"map access: trying to leak tained dst reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF),
+	BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
+	BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+	BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 4 },
+	.result = REJECT,
+	.errstr = "math between map_value pointer and 4294967295 is not allowed",
+},
 {
 	"32bit pkt_ptr -= scalar",
 	.insns = {
-- 
cgit v1.2.3


From a840974e96fd51b47c79301522bccf23cc8bb388 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Thu, 16 Dec 2021 16:14:54 +0100
Subject: perf test: Test 73 Sig_trap fails on s390

In Linux next commit 5504f67944484495 ("perf test sigtrap: Add basic
stress test for sigtrap handling") introduced the new test which uses
breakpoint events.  These events are not supported on s390 and PowerPC
and always fail:

  # perf test -F 73
  73: Sigtrap                                                         : FAILED!
  #

Fix it the same way as in the breakpoint tests in file
tests/bp_account.c where these type of tests are skipped on s390 and
PowerPC platforms.

With this patch skip this test on both platforms.

Output after:

  # perf test -F 73
  73: Sigtrap
  #

Fixes: 5504f67944484495 ("perf test sigtrap: Add basic stress test for sigtrap handling")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: https://lore.kernel.org/r/20211216151454.752066-1-tmricht@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/sigtrap.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c
index 1004bf0e7cc9..1f147fe6595f 100644
--- a/tools/perf/tests/sigtrap.c
+++ b/tools/perf/tests/sigtrap.c
@@ -22,6 +22,19 @@
 #include "tests.h"
 #include "../perf-sys.h"
 
+/*
+ * PowerPC and S390 do not support creation of instruction breakpoints using the
+ * perf_event interface.
+ *
+ * Just disable the test for these architectures until these issues are
+ * resolved.
+ */
+#if defined(__powerpc__) || defined(__s390x__)
+#define BP_ACCOUNT_IS_SUPPORTED 0
+#else
+#define BP_ACCOUNT_IS_SUPPORTED 1
+#endif
+
 #define NUM_THREADS 5
 
 static struct {
@@ -122,6 +135,11 @@ static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __m
 	char sbuf[STRERR_BUFSIZE];
 	int i, fd, ret = TEST_FAIL;
 
+	if (!BP_ACCOUNT_IS_SUPPORTED) {
+		pr_debug("Test not supported on this architecture");
+		return TEST_SKIP;
+	}
+
 	pthread_barrier_init(&barrier, NULL, NUM_THREADS + 1);
 
 	action.sa_flags = SA_SIGINFO | SA_NODEFER;
-- 
cgit v1.2.3


From cebdb7374577ac6e14afb11311af8c2c44a259fa Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:38 +0000
Subject: tools: Help cross-building with clang

Cross-compilation with clang uses the -target parameter rather than a
toolchain prefix. Just like the kernel Makefile, add that parameter to
CFLAGS when CROSS_COMPILE is set.

Unlike the kernel Makefile, we use the --sysroot and --gcc-toolchain
options because unlike the kernel, tools require standard libraries.
Commit c91d4e47e10e ("Makefile: Remove '--gcc-toolchain' flag") provides
some background about --gcc-toolchain. Normally clang finds on its own
the additional utilities and libraries that it needs (for example GNU ld
or glibc). On some systems however, this autodetection doesn't work.
There, our only recourse is asking GCC directly, and pass the result to
--sysroot and --gcc-toolchain. Of course that only works when a cross
GCC is available.

Autodetection worked fine on Debian, but to use the aarch64-linux-gnu
toolchain from Archlinux I needed both --sysroot (for crt1.o) and
--gcc-toolchain (for crtbegin.o, -lgcc). The --prefix parameter wasn't
needed there, but it might be useful on other distributions.

Use the CLANG_CROSS_FLAGS variable instead of CLANG_FLAGS because it
allows tools such as bpftool, that need to build both host and target
binaries, to easily filter out the cross-build flags from CFLAGS.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-2-jean-philippe@linaro.org
---
 tools/scripts/Makefile.include | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index 071312f5eb92..b0be5f40a3f1 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -87,7 +87,18 @@ LLVM_STRIP	?= llvm-strip
 
 ifeq ($(CC_NO_CLANG), 1)
 EXTRA_WARNINGS += -Wstrict-aliasing=3
-endif
+
+else ifneq ($(CROSS_COMPILE),)
+CLANG_CROSS_FLAGS := --target=$(notdir $(CROSS_COMPILE:%-=%))
+GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)gcc))
+ifneq ($(GCC_TOOLCHAIN_DIR),)
+CLANG_CROSS_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
+CLANG_CROSS_FLAGS += --sysroot=$(shell $(CROSS_COMPILE)gcc -print-sysroot)
+CLANG_CROSS_FLAGS += --gcc-toolchain=$(realpath $(GCC_TOOLCHAIN_DIR)/..)
+endif # GCC_TOOLCHAIN_DIR
+CFLAGS += $(CLANG_CROSS_FLAGS)
+AFLAGS += $(CLANG_CROSS_FLAGS)
+endif # CROSS_COMPILE
 
 # Hack to avoid type-punned warnings on old systems such as RHEL5:
 # We should be changing CFLAGS and checking gcc version, but this
-- 
cgit v1.2.3


From bf1be903461a404a9d1c720b0872501ca35abc89 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:39 +0000
Subject: tools/resolve_btfids: Support cross-building the kernel with clang

The CROSS_COMPILE variable may be present during resolve_btfids build if
the kernel is being cross-built. Since resolve_btfids is always executed
on the host, we set CC to HOSTCC in order to use the host toolchain when
cross-building with GCC. But instead of a toolchain prefix, cross-build
with clang uses a "-target" parameter, which Makefile.include deduces
from the CROSS_COMPILE variable. In order to avoid cross-building
libbpf, clear CROSS_COMPILE before building resolve_btfids.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-3-jean-philippe@linaro.org
---
 tools/bpf/resolve_btfids/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index 751643f860b2..9ddeca947635 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -19,6 +19,7 @@ CC       = $(HOSTCC)
 LD       = $(HOSTLD)
 ARCH     = $(HOSTARCH)
 RM      ?= rm
+CROSS_COMPILE =
 
 OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/
 
-- 
cgit v1.2.3


From 4980beb4cda2bc413a3a044e1851b0daaf137bf6 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:40 +0000
Subject: tools/libbpf: Enable cross-building with clang

Cross-building using clang requires passing the "-target" flag rather
than using the CROSS_COMPILE prefix. Makefile.include transforms
CROSS_COMPILE into CLANG_CROSS_FLAGS. Add them to the CFLAGS.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-4-jean-philippe@linaro.org
---
 tools/lib/bpf/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 5f7086fae31c..f947b61b2107 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -90,6 +90,7 @@ override CFLAGS += -Werror -Wall
 override CFLAGS += $(INCLUDES)
 override CFLAGS += -fvisibility=hidden
 override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+override CFLAGS += $(CLANG_CROSS_FLAGS)
 
 # flags specific for shared library
 SHLIB_FLAGS := -DSHARED -fPIC
@@ -162,7 +163,7 @@ $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
 $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
 
 $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(VERSION_SCRIPT)
-	$(QUIET_LINK)$(CC) $(LDFLAGS) \
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) \
 		--shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
 		-Wl,--version-script=$(VERSION_SCRIPT) $< -lelf -lz -o $@
 	@ln -sf $(@F) $(OUTPUT)libbpf.so
-- 
cgit v1.2.3


From bdadbb44c90aedaa74d46f1b113bd845774efa39 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:41 +0000
Subject: bpftool: Enable cross-building with clang

Cross-building using clang requires passing the "-target" flag rather
than using the CROSS_COMPILE prefix. Makefile.include transforms
CROSS_COMPILE into CLANG_CROSS_FLAGS, and adds that to CFLAGS. Remove
the cross flags for the bootstrap bpftool, and erase the CROSS_COMPILE
flag for the bootstrap libbpf.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-5-jean-philippe@linaro.org
---
 tools/bpf/bpftool/Makefile | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 42eb8eee3d89..83369f55df61 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -57,7 +57,7 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_
 $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT)
 	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \
 		DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR) prefix= \
-		ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ install_headers
+		ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) $@ install_headers
 
 $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR)
 	$(call QUIET_INSTALL, $@)
@@ -152,6 +152,9 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT
 SRCS += $(BFD_SRCS)
 endif
 
+HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\
+		$(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS)))
+
 BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool
 
 BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o)
@@ -202,7 +205,7 @@ endif
 CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS)
 
 $(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
-	$(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD $< -o $@
+	$(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@
 
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
 	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@
@@ -213,15 +216,13 @@ ifneq ($(feature-zlib), 1)
 endif
 
 $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP)
-	$(QUIET_LINK)$(HOSTCC) $(CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@
+	$(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
 	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
 
 $(BOOTSTRAP_OUTPUT)%.o: %.c $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS) | $(BOOTSTRAP_OUTPUT)
-	$(QUIET_CC)$(HOSTCC) \
-		$(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),$(CFLAGS)) \
-		-c -MMD $< -o $@
+	$(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@
 
 $(OUTPUT)%.o: %.c
 	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@
-- 
cgit v1.2.3


From bb7b75e860eec31aa67b83935849fdc46418c13e Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:42 +0000
Subject: tools/runqslower: Enable cross-building with clang

Cross-building using clang requires passing the "-target" flag rather
than using the CROSS_COMPILE prefix. Makefile.include transforms
CROSS_COMPILE into CLANG_CROSS_FLAGS. Add them to CFLAGS, and erase
CROSS_COMPILE for the bpftool build, since it needs to be executed on
the host.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-6-jean-philippe@linaro.org
---
 tools/bpf/runqslower/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile
index 8791d0e2762b..da6de16a3dfb 100644
--- a/tools/bpf/runqslower/Makefile
+++ b/tools/bpf/runqslower/Makefile
@@ -12,7 +12,7 @@ BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a
 BPF_DESTDIR := $(BPFOBJ_OUTPUT)
 BPF_INCLUDE := $(BPF_DESTDIR)/include
 INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi)
-CFLAGS := -g -Wall
+CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS)
 
 # Try to detect best kernel BTF source
 KERNEL_REL := $(shell uname -r)
@@ -88,4 +88,4 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OU
 
 $(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
 	$(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT)   \
-		    CC=$(HOSTCC) LD=$(HOSTLD)
+		    ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD)
-- 
cgit v1.2.3


From ea79020a2d9eea62b12d90f0c11b7d70fcadc172 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 16 Dec 2021 16:38:43 +0000
Subject: selftests/bpf: Enable cross-building with clang

Cross building using clang requires passing the "-target" flag rather
than using the CROSS_COMPILE prefix. Makefile.include transforms
CROSS_COMPILE into CLANG_CROSS_FLAGS. Clear CROSS_COMPILE for bpftool
and the host libbpf, and use the clang flags for urandom_read and bench.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211216163842.829836-7-jean-philippe@linaro.org
---
 tools/testing/selftests/bpf/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 43f6f3159431..42ffc24e9e71 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -170,7 +170,7 @@ $(OUTPUT)/%:%.c
 
 $(OUTPUT)/urandom_read: urandom_read.c
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(LDFLAGS) $< $(LDLIBS) -Wl,--build-id=sha1 -o $@
+	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -Wl,--build-id=sha1 -o $@
 
 $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
 	$(call msg,MOD,,$@)
@@ -217,7 +217,7 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
 		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
 	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)			       \
-		    CC=$(HOSTCC) LD=$(HOSTLD)				       \
+		    ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) 	       \
 		    EXTRA_CFLAGS='-g -O0'				       \
 		    OUTPUT=$(HOST_BUILD_DIR)/bpftool/			       \
 		    LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/		       \
@@ -248,7 +248,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 		$(APIDIR)/linux/bpf.h					       \
 		| $(HOST_BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
-		    EXTRA_CFLAGS='-g -O0'				       \
+		    EXTRA_CFLAGS='-g -O0' ARCH= CROSS_COMPILE=		       \
 		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \
 		    DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers
 endif
@@ -551,7 +551,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_loop.o \
 		 $(OUTPUT)/bench_strncmp.o
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
+	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)	\
 	prog_tests/tests.h map_tests/tests.h verifier/tests.h		\
-- 
cgit v1.2.3


From 7edc3fcbf9a2b2e3df53c9656a9f85bf807affac Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 15 Dec 2021 12:35:34 -0800
Subject: selftest/bpf: Add a test that reads various addresses.

Add a function to bpf_testmod that returns invalid kernel and user addresses.
Then attach an fexit program to that function that tries to read
memory through these addresses.

This logic checks that bpf_probe_read_kernel and BPF_PROBE_MEM logic is sane.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../testing/selftests/bpf/bpf_testmod/bpf_testmod.c  | 20 ++++++++++++++++++++
 .../testing/selftests/bpf/progs/test_module_attach.c | 12 ++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 5d52ea2768df..df3b292a8ffe 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -33,6 +33,22 @@ noinline int bpf_testmod_loop_test(int n)
 	return sum;
 }
 
+__weak noinline struct file *bpf_testmod_return_ptr(int arg)
+{
+	static struct file f = {};
+
+	switch (arg) {
+	case 1: return (void *)EINVAL;		/* user addr */
+	case 2: return (void *)0xcafe4a11;	/* user addr */
+	case 3: return (void *)-EINVAL;		/* canonical, but invalid */
+	case 4: return (void *)(1ull << 60);	/* non-canonical and invalid */
+	case 5: return (void *)~(1ull << 30);	/* trigger extable */
+	case 6: return &f;			/* valid addr */
+	case 7: return (void *)((long)&f | 1);	/* kernel tricks */
+	default: return NULL;
+	}
+}
+
 noinline ssize_t
 bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 		      struct bin_attribute *bin_attr,
@@ -43,6 +59,10 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 		.off = off,
 		.len = len,
 	};
+	int i = 1;
+
+	while (bpf_testmod_return_ptr(i))
+		i++;
 
 	/* This is always true. Use the check to make sure the compiler
 	 * doesn't remove bpf_testmod_loop_test.
diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c
index b36857093f71..50ce16d02da7 100644
--- a/tools/testing/selftests/bpf/progs/test_module_attach.c
+++ b/tools/testing/selftests/bpf/progs/test_module_attach.c
@@ -87,6 +87,18 @@ int BPF_PROG(handle_fexit,
 	return 0;
 }
 
+SEC("fexit/bpf_testmod_return_ptr")
+int BPF_PROG(handle_fexit_ret, int arg, struct file *ret)
+{
+	long buf = 0;
+
+	bpf_probe_read_kernel(&buf, 8, ret);
+	bpf_probe_read_kernel(&buf, 8, (char *)ret + 256);
+	*(volatile long long *)ret;
+	*(volatile int *)&ret->f_mode;
+	return 0;
+}
+
 __u32 fmod_ret_read_sz = 0;
 
 SEC("fmod_ret/bpf_testmod_test_read")
-- 
cgit v1.2.3


From c2fcbf81c332b42382a0c439bfe2414a241e4f5b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 16 Dec 2021 11:16:30 -0800
Subject: bpf, selftests: Fix racing issue in btf_skc_cls_ingress test

The libbpf CI reported occasional failure in btf_skc_cls_ingress:

  test_syncookie:FAIL:Unexpected syncookie states gen_cookie:80326634 recv_cookie:0
  bpf prog error at line 97

"error at line 97" means the bpf prog cannot find the listening socket
when the final ack is received.  It then skipped processing
the syncookie in the final ack which then led to "recv_cookie:0".

The problem is the userspace program did not do accept() and went
ahead to close(listen_fd) before the kernel (and the bpf prog) had
a chance to process the final ack.

The fix is to add accept() call so that the userspace will wait for
the kernel to finish processing the final ack first before close()-ing
everything.

Fixes: 9a856cae2217 ("bpf: selftest: Add test_btf_skc_cls_ingress")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211216191630.466151-1-kafai@fb.com
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c       | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 762f6a9da8b5..664ffc0364f4 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -90,7 +90,7 @@ static void print_err_line(void)
 
 static void test_conn(void)
 {
-	int listen_fd = -1, cli_fd = -1, err;
+	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
@@ -112,6 +112,10 @@ static void test_conn(void)
 	if (CHECK_FAIL(cli_fd == -1))
 		goto done;
 
+	srv_fd = accept(listen_fd, NULL, NULL);
+	if (CHECK_FAIL(srv_fd == -1))
+		goto done;
+
 	if (CHECK(skel->bss->listen_tp_sport != srv_port ||
 		  skel->bss->req_sk_sport != srv_port,
 		  "Unexpected sk src port",
@@ -134,11 +138,13 @@ done:
 		close(listen_fd);
 	if (cli_fd != -1)
 		close(cli_fd);
+	if (srv_fd != -1)
+		close(srv_fd);
 }
 
 static void test_syncookie(void)
 {
-	int listen_fd = -1, cli_fd = -1, err;
+	int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
 	socklen_t addrlen = sizeof(srv_sa6);
 	int srv_port;
 
@@ -161,6 +167,10 @@ static void test_syncookie(void)
 	if (CHECK_FAIL(cli_fd == -1))
 		goto done;
 
+	srv_fd = accept(listen_fd, NULL, NULL);
+	if (CHECK_FAIL(srv_fd == -1))
+		goto done;
+
 	if (CHECK(skel->bss->listen_tp_sport != srv_port,
 		  "Unexpected tp src port",
 		  "listen_tp_sport:%u expected:%u\n",
@@ -188,6 +198,8 @@ done:
 		close(listen_fd);
 	if (cli_fd != -1)
 		close(cli_fd);
+	if (srv_fd != -1)
+		close(srv_fd);
 }
 
 struct test {
-- 
cgit v1.2.3


From 90091c367e74d5b58d9ebe979cc363f7468f58d3 Mon Sep 17 00:00:00 2001
From: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 5 Aug 2021 19:12:36 +0900
Subject: selftest/lkdtm: Skip stack-entropy test if lkdtm is not available

Exit with return code 4 if lkdtm is not available like other tests
in order to properly skip the test.

Signed-off-by: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210805101236.1140381-1-misono.tomohiro@jp.fujitsu.com
---
 tools/testing/selftests/lkdtm/stack-entropy.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh
index 1b4d95d575f8..14fedeef762e 100755
--- a/tools/testing/selftests/lkdtm/stack-entropy.sh
+++ b/tools/testing/selftests/lkdtm/stack-entropy.sh
@@ -4,13 +4,27 @@
 # Measure kernel stack entropy by sampling via LKDTM's REPORT_STACK test.
 set -e
 samples="${1:-1000}"
+TRIGGER=/sys/kernel/debug/provoke-crash/DIRECT
+KSELFTEST_SKIP_TEST=4
+
+# Verify we have LKDTM available in the kernel.
+if [ ! -r $TRIGGER ] ; then
+	/sbin/modprobe -q lkdtm || true
+	if [ ! -r $TRIGGER ] ; then
+		echo "Cannot find $TRIGGER (missing CONFIG_LKDTM?)"
+	else
+		echo "Cannot write $TRIGGER (need to run as root?)"
+	fi
+	# Skip this test
+	exit $KSELFTEST_SKIP_TEST
+fi
 
 # Capture dmesg continuously since it may fill up depending on sample size.
 log=$(mktemp -t stack-entropy-XXXXXX)
 dmesg --follow >"$log" & pid=$!
 report=-1
 for i in $(seq 1 $samples); do
-        echo "REPORT_STACK" >/sys/kernel/debug/provoke-crash/DIRECT
+        echo "REPORT_STACK" > $TRIGGER
 	if [ -t 1 ]; then
 		percent=$(( 100 * $i / $samples ))
 		if [ "$percent" -ne "$report" ]; then
-- 
cgit v1.2.3


From 0f55f9ed21f96630c6ec96805d42f92c0b458b37 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Thu, 16 Dec 2021 13:33:56 -0800
Subject: bpf: Only print scratched registers and stack slots to verifier logs.

When printing verifier state for any log level, print full verifier
state only on function calls or on errors. Otherwise, only print the
registers and stack slots that were accessed.

Log size differences:

verif_scale_loop6 before: 234566564
verif_scale_loop6 after: 72143943
69% size reduction

kfree_skb before: 166406
kfree_skb after: 55386
69% size reduction

Before:

156: (61) r0 = *(u32 *)(r1 +0)
157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) R2_w=invP0 R10=fp0 fp-8_w=00000000 fp-16_w=00\
000000 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 fp-56_w=00000000 fp-64_w=00000000 fp-72_w=00000000 fp-80_w=00000\
000 fp-88_w=00000000 fp-96_w=00000000 fp-104_w=00000000 fp-112_w=00000000 fp-120_w=00000000 fp-128_w=00000000 fp-136_w=00000000 fp-144_w=00\
000000 fp-152_w=00000000 fp-160_w=00000000 fp-168_w=00000000 fp-176_w=00000000 fp-184_w=00000000 fp-192_w=00000000 fp-200_w=00000000 fp-208\
_w=00000000 fp-216_w=00000000 fp-224_w=00000000 fp-232_w=00000000 fp-240_w=00000000 fp-248_w=00000000 fp-256_w=00000000 fp-264_w=00000000 f\
p-272_w=00000000 fp-280_w=00000000 fp-288_w=00000000 fp-296_w=00000000 fp-304_w=00000000 fp-312_w=00000000 fp-320_w=00000000 fp-328_w=00000\
000 fp-336_w=00000000 fp-344_w=00000000 fp-352_w=00000000 fp-360_w=00000000 fp-368_w=00000000 fp-376_w=00000000 fp-384_w=00000000 fp-392_w=\
00000000 fp-400_w=00000000 fp-408_w=00000000 fp-416_w=00000000 fp-424_w=00000000 fp-432_w=00000000 fp-440_w=00000000 fp-448_w=00000000
; return skb->len;
157: (95) exit
Func#4 is safe for any args that match its prototype
Validating get_constant() func#5...
158: R1=invP(id=0) R10=fp0
; int get_constant(long val)
158: (bf) r0 = r1
159: R0_w=invP(id=1) R1=invP(id=1) R10=fp0
; return val - 122;
159: (04) w0 += -122
160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=1) R10=fp0
; return val - 122;
160: (95) exit
Func#5 is safe for any args that match its prototype
Validating get_skb_ifindex() func#6...
161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0
; int get_skb_ifindex(int val, struct __sk_buff *skb, int var)
161: (bc) w0 = w3
162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0

After:

156: (61) r0 = *(u32 *)(r1 +0)
157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0)
; return skb->len;
157: (95) exit
Func#4 is safe for any args that match its prototype
Validating get_constant() func#5...
158: R1=invP(id=0) R10=fp0
; int get_constant(long val)
158: (bf) r0 = r1
159: R0_w=invP(id=1) R1=invP(id=1)
; return val - 122;
159: (04) w0 += -122
160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return val - 122;
160: (95) exit
Func#5 is safe for any args that match its prototype
Validating get_skb_ifindex() func#6...
161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0
; int get_skb_ifindex(int val, struct __sk_buff *skb, int var)
161: (bc) w0 = w3
162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=invP(id=0)

Signed-off-by: Christy Lee <christylee@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211216213358.3374427-2-christylee@fb.com
---
 include/linux/bpf_verifier.h                   |  7 +++
 kernel/bpf/verifier.c                          | 83 +++++++++++++++++++++-----
 tools/testing/selftests/bpf/prog_tests/align.c | 30 +++++-----
 3 files changed, 91 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 182b16a91084..c66f238c538d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -474,6 +474,13 @@ struct bpf_verifier_env {
 	/* longest register parentage chain walked for liveness marking */
 	u32 longest_mark_read_walk;
 	bpfptr_t fd_array;
+
+	/* bit mask to keep track of whether a register has been accessed
+	 * since the last time the function state was printed
+	 */
+	u32 scratched_regs;
+	/* Same as scratched_regs but for stack slots */
+	u64 scratched_stack_slots;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f0604796132f..ded6e816dcd9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -609,6 +609,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id)
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
 }
 
+static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+	env->scratched_regs |= 1U << regno;
+}
+
+static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+	env->scratched_stack_slots |= 1UL << spi;
+}
+
+static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+	return (env->scratched_regs >> regno) & 1;
+}
+
+static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+	return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+	return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = 0U;
+	env->scratched_stack_slots = 0UL;
+}
+
+/* Used for printing the entire verifier state. */
+static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = ~0U;
+	env->scratched_stack_slots = ~0UL;
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -624,7 +662,8 @@ static void scrub_spilled_slot(u8 *stype)
 }
 
 static void print_verifier_state(struct bpf_verifier_env *env,
-				 const struct bpf_func_state *state)
+				 const struct bpf_func_state *state,
+				 bool print_all)
 {
 	const struct bpf_reg_state *reg;
 	enum bpf_reg_type t;
@@ -637,6 +676,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
+		if (!print_all && !reg_scratched(env, i))
+			continue;
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=%s", reg_type_str[t]);
@@ -726,6 +767,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		types_buf[BPF_REG_SIZE] = 0;
 		if (!valid)
 			continue;
+		if (!print_all && !stack_slot_scratched(env, i))
+			continue;
 		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 		print_liveness(env, state->stack[i].spilled_ptr.live);
 		if (is_spilled_reg(&state->stack[i])) {
@@ -751,6 +794,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	if (state->in_async_callback_fn)
 		verbose(env, " async_cb");
 	verbose(env, "\n");
+	mark_verifier_state_clean(env);
 }
 
 /* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -1541,6 +1585,7 @@ static void init_func_state(struct bpf_verifier_env *env,
 	state->frameno = frameno;
 	state->subprogno = subprogno;
 	init_reg_state(env, state);
+	mark_verifier_state_scratched(env);
 }
 
 /* Similar to push_stack(), but for async callbacks */
@@ -2228,6 +2273,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 		return -EINVAL;
 	}
 
+	mark_reg_scratched(env, regno);
+
 	reg = &regs[regno];
 	rw64 = is_reg64(env, insn, regno, reg, t);
 	if (t == SRC_OP) {
@@ -2678,7 +2725,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
 			reg->precise = true;
 		}
 		if (env->log.level & BPF_LOG_LEVEL) {
-			print_verifier_state(env, func);
+			print_verifier_state(env, func, false);
 			verbose(env, "parent %s regs=%x stack=%llx marks\n",
 				new_marks ? "didn't have" : "already had",
 				reg_mask, stack_mask);
@@ -2837,6 +2884,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
 	}
 
+	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
 		if (dst_reg != BPF_REG_FP) {
@@ -2958,6 +3006,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 		slot = -i - 1;
 		spi = slot / BPF_REG_SIZE;
 		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+		mark_stack_slot_scratched(env, spi);
 
 		if (!env->allow_ptr_leaks
 				&& *stype != NOT_INIT
@@ -3376,7 +3425,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
 	 * to make sure our theoretical access will be safe.
 	 */
 	if (env->log.level & BPF_LOG_LEVEL)
-		print_verifier_state(env, state);
+		print_verifier_state(env, state, false);
 
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
@@ -6011,9 +6060,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	if (env->log.level & BPF_LOG_LEVEL) {
 		verbose(env, "caller:\n");
-		print_verifier_state(env, caller);
+		print_verifier_state(env, caller, true);
 		verbose(env, "callee:\n");
-		print_verifier_state(env, callee);
+		print_verifier_state(env, callee, true);
 	}
 	return 0;
 }
@@ -6228,9 +6277,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	*insn_idx = callee->callsite + 1;
 	if (env->log.level & BPF_LOG_LEVEL) {
 		verbose(env, "returning from callee:\n");
-		print_verifier_state(env, callee);
+		print_verifier_state(env, callee, true);
 		verbose(env, "to caller at %d:\n", *insn_idx);
-		print_verifier_state(env, caller);
+		print_verifier_state(env, caller, true);
 	}
 	/* clear everything in the callee */
 	free_func_state(callee);
@@ -8249,12 +8298,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(env, state);
+		print_verifier_state(env, state, true);
 		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(env, state);
+		print_verifier_state(env, state, true);
 		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
@@ -9389,7 +9438,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 	if (env->log.level & BPF_LOG_LEVEL)
-		print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+		print_verifier_state(env, this_branch->frame[this_branch->curframe], false);
 	return 0;
 }
 
@@ -11259,14 +11308,17 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level & BPF_LOG_LEVEL2 ||
 		    (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
-			if (env->log.level & BPF_LOG_LEVEL2)
-				verbose(env, "%d:", env->insn_idx);
-			else
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				if (verifier_state_scratched(env))
+					verbose(env, "%d:", env->insn_idx);
+			} else {
 				verbose(env, "\nfrom %d to %d%s:",
 					env->prev_insn_idx, env->insn_idx,
 					env->cur_state->speculative ?
 					" (speculative execution)" : "");
-			print_verifier_state(env, state->frame[state->curframe]);
+			}
+			print_verifier_state(env, state->frame[state->curframe],
+					     false);
 			do_print_state = false;
 		}
 
@@ -11488,6 +11540,7 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
+				mark_verifier_state_scratched(env);
 				update_branch_counts(env, env->cur_state);
 				err = pop_stack(env, &prev_insn_idx,
 						&env->insn_idx, pop_log);
@@ -14148,6 +14201,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 		}
 	}
 
+	mark_verifier_state_clean(env);
+
 	if (IS_ERR(btf_vmlinux)) {
 		/* Either gcc or pahole or kernel are broken. */
 		verbose(env, "in-kernel BTF is malformed\n");
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index 837f67c6bfda..aeb2080a67f7 100644
--- a/tools/testing/selftests/bpf/prog_tests/align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -39,8 +39,8 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{1, "R1=ctx(id=0,off=0,imm=0)"},
-			{1, "R10=fp0"},
+			{0, "R1=ctx(id=0,off=0,imm=0)"},
+			{0, "R10=fp0"},
 			{1, "R3_w=inv2"},
 			{2, "R3_w=inv4"},
 			{3, "R3_w=inv8"},
@@ -67,8 +67,8 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{1, "R1=ctx(id=0,off=0,imm=0)"},
-			{1, "R10=fp0"},
+			{0, "R1=ctx(id=0,off=0,imm=0)"},
+			{0, "R10=fp0"},
 			{1, "R3_w=inv1"},
 			{2, "R3_w=inv2"},
 			{3, "R3_w=inv4"},
@@ -96,8 +96,8 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{1, "R1=ctx(id=0,off=0,imm=0)"},
-			{1, "R10=fp0"},
+			{0, "R1=ctx(id=0,off=0,imm=0)"},
+			{0, "R10=fp0"},
 			{1, "R3_w=inv4"},
 			{2, "R3_w=inv8"},
 			{3, "R3_w=inv10"},
@@ -118,8 +118,8 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{1, "R1=ctx(id=0,off=0,imm=0)"},
-			{1, "R10=fp0"},
+			{0, "R1=ctx(id=0,off=0,imm=0)"},
+			{0, "R10=fp0"},
 			{1, "R3_w=inv7"},
 			{2, "R3_w=inv7"},
 			{3, "R3_w=inv14"},
@@ -161,13 +161,13 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
+			{6, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
 			{7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
 			{8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
 			{9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			{10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
 			{11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-			{18, "R3=pkt_end(id=0,off=0,imm=0)"},
+			{13, "R3_w=pkt_end(id=0,off=0,imm=0)"},
 			{18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
 			{19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
 			{20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
@@ -234,10 +234,10 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
+			{3, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
 			{5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
 			{6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
-			{10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
+			{9, "R2=pkt(id=0,off=0,r=18,imm=0)"},
 			{10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
 			{10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
 			{14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
@@ -296,7 +296,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
 			{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Offset is added to packet pointer R5, resulting in
 			 * known fixed offset, and variable offset from R6.
@@ -386,7 +386,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
 			{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Adding 14 makes R6 be (4n+2) */
 			{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
@@ -458,7 +458,7 @@ static struct bpf_align_test tests[] = {
 			/* Checked s>=0 */
 			{9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
 			/* packet pointer + nonnegative (4n+2) */
-			{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+			{12, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
 			{13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
 			/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
 			 * We checked the bounds, but it might have been able
-- 
cgit v1.2.3


From 2e5766483c8c5cf886b4dc647a1741738dde7d79 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Thu, 16 Dec 2021 19:42:45 -0800
Subject: bpf: Right align verifier states in verifier logs.

Make the verifier logs more readable, print the verifier states
on the corresponding instruction line. If the previous line was
not a bpf instruction, then print the verifier states on its own
line.

Before:

Validating test_pkt_access_subprog3() func#3...
86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0
; int test_pkt_access_subprog3(int val, struct __sk_buff *skb)
86: (bf) r6 = r2
87: R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
87: (bc) w7 = w1
88: R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
88: (bf) r1 = r6
89: R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
89: (85) call pc+9
Func#4 is global and valid. Skipping.
90: R0_w=invP(id=0)
90: (bc) w8 = w0
91: R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
91: (b7) r1 = 123
92: R1_w=invP123
92: (85) call pc+65
Func#5 is global and valid. Skipping.
93: R0=invP(id=0)

After:

86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0
; int test_pkt_access_subprog3(int val, struct __sk_buff *skb)
86: (bf) r6 = r2                      ; R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
87: (bc) w7 = w1                      ; R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
88: (bf) r1 = r6                      ; R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0)
89: (85) call pc+9
Func#4 is global and valid. Skipping.
90: R0_w=invP(id=0)
90: (bc) w8 = w0                      ; R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123));
91: (b7) r1 = 123                     ; R1_w=invP123
92: (85) call pc+65
Func#5 is global and valid. Skipping.
93: R0=invP(id=0)

Signed-off-by: Christy Lee <christylee@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                   |   3 +
 kernel/bpf/verifier.c                          |  57 ++++++---
 tools/testing/selftests/bpf/prog_tests/align.c | 169 ++++++++++++++-----------
 3 files changed, 131 insertions(+), 98 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c66f238c538d..ee931398f311 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -388,6 +388,8 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
 #define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
 #define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
+#define BPF_LOG_MIN_ALIGNMENT 8U
+#define BPF_LOG_ALIGNMENT 40U
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
@@ -481,6 +483,7 @@ struct bpf_verifier_env {
 	u32 scratched_regs;
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
+	u32 prev_log_len, prev_insn_print_len;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ded6e816dcd9..72c4a6bbb5bc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -797,6 +797,25 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	mark_verifier_state_clean(env);
 }
 
+static inline u32 vlog_alignment(u32 pos)
+{
+	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+static void print_insn_state(struct bpf_verifier_env *env,
+			     const struct bpf_func_state *state)
+{
+	if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+		/* remove new line character */
+		bpf_vlog_reset(&env->log, env->prev_log_len - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+	} else {
+		verbose(env, "%d:", env->insn_idx);
+	}
+	print_verifier_state(env, state, false);
+}
+
 /* copy array src of length n * size bytes to dst. dst is reallocated if it's too
  * small to hold src. This is different from krealloc since we don't want to preserve
  * the contents of dst.
@@ -2725,10 +2744,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
 			reg->precise = true;
 		}
 		if (env->log.level & BPF_LOG_LEVEL) {
-			print_verifier_state(env, func, false);
-			verbose(env, "parent %s regs=%x stack=%llx marks\n",
+			verbose(env, "parent %s regs=%x stack=%llx marks:",
 				new_marks ? "didn't have" : "already had",
 				reg_mask, stack_mask);
+			print_verifier_state(env, func, true);
 		}
 
 		if (!reg_mask && !stack_mask)
@@ -3423,11 +3442,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
 	/* We may have adjusted the register pointing to memory region, so we
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
-	 */
-	if (env->log.level & BPF_LOG_LEVEL)
-		print_verifier_state(env, state, false);
-
-	/* The minimum value is only important with signed
+	 *
+	 * The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
 	 * value is 0.  If we are using signed variables for our
 	 * index'es we need to make sure that whatever we use
@@ -9438,7 +9454,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 	if (env->log.level & BPF_LOG_LEVEL)
-		print_verifier_state(env, this_branch->frame[this_branch->curframe], false);
+		print_insn_state(env, this_branch->frame[this_branch->curframe]);
 	return 0;
 }
 
@@ -11306,19 +11322,12 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (env->log.level & BPF_LOG_LEVEL2 ||
-		    (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
-			if (env->log.level & BPF_LOG_LEVEL2) {
-				if (verifier_state_scratched(env))
-					verbose(env, "%d:", env->insn_idx);
-			} else {
-				verbose(env, "\nfrom %d to %d%s:",
-					env->prev_insn_idx, env->insn_idx,
-					env->cur_state->speculative ?
-					" (speculative execution)" : "");
-			}
-			print_verifier_state(env, state->frame[state->curframe],
-					     false);
+		if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+			verbose(env, "\nfrom %d to %d%s:",
+				env->prev_insn_idx, env->insn_idx,
+				env->cur_state->speculative ?
+				" (speculative execution)" : "");
+			print_verifier_state(env, state->frame[state->curframe], true);
 			do_print_state = false;
 		}
 
@@ -11329,9 +11338,15 @@ static int do_check(struct bpf_verifier_env *env)
 				.private_data	= env,
 			};
 
+			if (verifier_state_scratched(env))
+				print_insn_state(env, state->frame[state->curframe]);
+
 			verbose_linfo(env, env->insn_idx, "; ");
+			env->prev_log_len = env->log.len_used;
 			verbose(env, "%d: ", env->insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+			env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
+			env->prev_log_len = env->log.len_used;
 		}
 
 		if (bpf_prog_is_dev_bound(env->prog->aux)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index aeb2080a67f7..0ee29e11eaee 100644
--- a/tools/testing/selftests/bpf/prog_tests/align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -41,11 +41,11 @@ static struct bpf_align_test tests[] = {
 		.matches = {
 			{0, "R1=ctx(id=0,off=0,imm=0)"},
 			{0, "R10=fp0"},
-			{1, "R3_w=inv2"},
-			{2, "R3_w=inv4"},
-			{3, "R3_w=inv8"},
-			{4, "R3_w=inv16"},
-			{5, "R3_w=inv32"},
+			{0, "R3_w=inv2"},
+			{1, "R3_w=inv4"},
+			{2, "R3_w=inv8"},
+			{3, "R3_w=inv16"},
+			{4, "R3_w=inv32"},
 		},
 	},
 	{
@@ -69,17 +69,17 @@ static struct bpf_align_test tests[] = {
 		.matches = {
 			{0, "R1=ctx(id=0,off=0,imm=0)"},
 			{0, "R10=fp0"},
-			{1, "R3_w=inv1"},
-			{2, "R3_w=inv2"},
-			{3, "R3_w=inv4"},
-			{4, "R3_w=inv8"},
-			{5, "R3_w=inv16"},
-			{6, "R3_w=inv1"},
-			{7, "R4_w=inv32"},
-			{8, "R4_w=inv16"},
-			{9, "R4_w=inv8"},
-			{10, "R4_w=inv4"},
-			{11, "R4_w=inv2"},
+			{0, "R3_w=inv1"},
+			{1, "R3_w=inv2"},
+			{2, "R3_w=inv4"},
+			{3, "R3_w=inv8"},
+			{4, "R3_w=inv16"},
+			{5, "R3_w=inv1"},
+			{6, "R4_w=inv32"},
+			{7, "R4_w=inv16"},
+			{8, "R4_w=inv8"},
+			{9, "R4_w=inv4"},
+			{10, "R4_w=inv2"},
 		},
 	},
 	{
@@ -98,12 +98,12 @@ static struct bpf_align_test tests[] = {
 		.matches = {
 			{0, "R1=ctx(id=0,off=0,imm=0)"},
 			{0, "R10=fp0"},
-			{1, "R3_w=inv4"},
-			{2, "R3_w=inv8"},
-			{3, "R3_w=inv10"},
-			{4, "R4_w=inv8"},
-			{5, "R4_w=inv12"},
-			{6, "R4_w=inv14"},
+			{0, "R3_w=inv4"},
+			{1, "R3_w=inv8"},
+			{2, "R3_w=inv10"},
+			{3, "R4_w=inv8"},
+			{4, "R4_w=inv12"},
+			{5, "R4_w=inv14"},
 		},
 	},
 	{
@@ -120,10 +120,10 @@ static struct bpf_align_test tests[] = {
 		.matches = {
 			{0, "R1=ctx(id=0,off=0,imm=0)"},
 			{0, "R10=fp0"},
+			{0, "R3_w=inv7"},
 			{1, "R3_w=inv7"},
-			{2, "R3_w=inv7"},
-			{3, "R3_w=inv14"},
-			{4, "R3_w=inv56"},
+			{2, "R3_w=inv14"},
+			{3, "R3_w=inv56"},
 		},
 	},
 
@@ -162,18 +162,18 @@ static struct bpf_align_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
 			{6, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
-			{7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-			{8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-			{9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-			{10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-			{11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-			{13, "R3_w=pkt_end(id=0,off=0,imm=0)"},
-			{18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-			{19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
-			{20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-			{21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-			{22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-			{23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+			{6, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+			{7, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+			{8, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{9, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+			{10, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+			{12, "R3_w=pkt_end(id=0,off=0,imm=0)"},
+			{17, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+			{18, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
+			{19, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+			{20, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+			{21, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{22, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
 		},
 	},
 	{
@@ -194,16 +194,16 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-			{8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
-			{9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-			{10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
-			{11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-			{12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
-			{13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-			{14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
-			{15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-			{16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+			{6, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+			{7, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+			{8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+			{9, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+			{10, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+			{11, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+			{12, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{13, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+			{14, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+			{15, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
 		},
 	},
 	{
@@ -234,14 +234,14 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{3, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
-			{5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
-			{6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
+			{2, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
+			{4, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
+			{5, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
 			{9, "R2=pkt(id=0,off=0,r=18,imm=0)"},
 			{10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
 			{10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+			{13, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
 			{14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
-			{15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
 		},
 	},
 	{
@@ -297,7 +297,7 @@ static struct bpf_align_test tests[] = {
 			 * alignment of 4.
 			 */
 			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-			{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{7, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Offset is added to packet pointer R5, resulting in
 			 * known fixed offset, and variable offset from R6.
 			 */
@@ -313,11 +313,11 @@ static struct bpf_align_test tests[] = {
 			/* Variable offset is added to R5 packet pointer,
 			 * resulting in auxiliary alignment of 4.
 			 */
-			{18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{17, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Constant offset is added to R5, resulting in
 			 * reg->off of 14.
 			 */
-			{19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{18, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* At the time the word size load is performed from R5,
 			 * its total fixed offset is NET_IP_ALIGN + reg->off
 			 * (14) which is 16.  Then the variable offset is 4-byte
@@ -329,18 +329,18 @@ static struct bpf_align_test tests[] = {
 			/* Constant offset is added to R5 packet pointer,
 			 * resulting in reg->off value of 14.
 			 */
-			{26, "R5_w=pkt(id=0,off=14,r=8"},
+			{25, "R5_w=pkt(id=0,off=14,r=8"},
 			/* Variable offset is added to R5, resulting in a
 			 * variable offset of (4n).
 			 */
-			{27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{26, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Constant is added to R5 again, setting reg->off to 18. */
-			{28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{27, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* And once more we add a variable; resulting var_off
 			 * is still (4n), fixed offset is not changed.
 			 * Also, we create a new reg->id.
 			 */
-			{29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+			{28, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
 			/* At the time the word size load is performed from R5,
 			 * its total fixed offset is NET_IP_ALIGN + reg->off (18)
 			 * which is 20.  Then the variable offset is (4n), so
@@ -387,12 +387,12 @@ static struct bpf_align_test tests[] = {
 			 * alignment of 4.
 			 */
 			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-			{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{7, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Adding 14 makes R6 be (4n+2) */
-			{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+			{8, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
 			/* Packet pointer has (4n+2) offset */
 			{11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
-			{13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+			{12, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
 			/* At the time the word size load is performed from R5,
 			 * its total fixed offset is NET_IP_ALIGN + reg->off (0)
 			 * which is 2.  Then the variable offset is (4n+2), so
@@ -403,12 +403,12 @@ static struct bpf_align_test tests[] = {
 			/* Newly read value in R6 was shifted left by 2, so has
 			 * known alignment of 4.
 			 */
-			{18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{17, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Added (4n) to packet pointer's (4n+2) var_off, giving
 			 * another (4n+2).
 			 */
 			{19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
-			{21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+			{20, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
 			/* At the time the word size load is performed from R5,
 			 * its total fixed offset is NET_IP_ALIGN + reg->off (0)
 			 * which is 2.  Then the variable offset is (4n+2), so
@@ -448,18 +448,18 @@ static struct bpf_align_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = REJECT,
 		.matches = {
-			{4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
+			{3, "R5_w=pkt_end(id=0,off=0,imm=0)"},
 			/* (ptr - ptr) << 2 == unknown, (4n) */
-			{6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
+			{5, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
 			/* (4n) + 14 == (4n+2).  We blow our bounds, because
 			 * the add could overflow.
 			 */
-			{7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+			{6, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
 			/* Checked s>=0 */
 			{9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
 			/* packet pointer + nonnegative (4n+2) */
-			{12, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
-			{13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+			{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+			{12, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
 			/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
 			 * We checked the bounds, but it might have been able
 			 * to overflow if the packet pointer started in the
@@ -502,14 +502,14 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-			{9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+			{8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Adding 14 makes R6 be (4n+2) */
-			{10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+			{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
 			/* New unknown value in R7 is (4n) */
-			{11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+			{10, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
 			/* Subtracting it from R6 blows our unsigned bounds */
-			{12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+			{11, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
 			/* Checked s>= 0 */
 			{14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
 			/* At the time the word size load is performed from R5,
@@ -556,14 +556,14 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-			{10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
+			{6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+			{9, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
 			/* Adding 14 makes R6 be (4n+2) */
-			{11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
+			{10, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
 			/* Subtracting from packet pointer overflows ubounds */
 			{13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
 			/* New unknown value in R7 is (4n), >= 76 */
-			{15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
+			{14, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
 			/* Adding it to packet pointer gives nice bounds again */
 			{16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
 			/* At the time the word size load is performed from R5,
@@ -625,12 +625,15 @@ static int do_test_single(struct bpf_align_test *test)
 		line_ptr = strtok(bpf_vlog_copy, "\n");
 		for (i = 0; i < MAX_MATCHES; i++) {
 			struct bpf_reg_match m = test->matches[i];
+			int tmp;
 
 			if (!m.match)
 				break;
 			while (line_ptr) {
 				cur_line = -1;
 				sscanf(line_ptr, "%u: ", &cur_line);
+				if (cur_line == -1)
+					sscanf(line_ptr, "from %u to %u: ", &tmp, &cur_line);
 				if (cur_line == m.line)
 					break;
 				line_ptr = strtok(NULL, "\n");
@@ -642,7 +645,19 @@ static int do_test_single(struct bpf_align_test *test)
 				printf("%s", bpf_vlog);
 				break;
 			}
+			/* Check the next line as well in case the previous line
+			 * did not have a corresponding bpf insn. Example:
+			 * func#0 @0
+			 * 0: R1=ctx(id=0,off=0,imm=0) R10=fp0
+			 * 0: (b7) r3 = 2                 ; R3_w=inv2
+			 */
 			if (!strstr(line_ptr, m.match)) {
+				cur_line = -1;
+				line_ptr = strtok(NULL, "\n");
+				sscanf(line_ptr, "%u: ", &cur_line);
+			}
+			if (cur_line != m.line || !line_ptr ||
+			    !strstr(line_ptr, m.match)) {
 				printf("Failed to find match %u: %s\n",
 				       m.line, m.match);
 				ret = 1;
-- 
cgit v1.2.3


From 16c138f338b62dd0a339b9d0ca819426231be7f6 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 13 Dec 2021 11:21:56 +0100
Subject: selftests: gpio: provide a helper for reading chip info

Add a simple program that allows to retrieve chip properties from the
GPIO character device. This will be used in gpio-sim selftests.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 tools/testing/selftests/gpio/.gitignore       |  1 +
 tools/testing/selftests/gpio/Makefile         |  2 +-
 tools/testing/selftests/gpio/gpio-chip-info.c | 57 +++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/gpio/gpio-chip-info.c

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/.gitignore b/tools/testing/selftests/gpio/.gitignore
index a4969f7ee020..4ea4f58dab1a 100644
--- a/tools/testing/selftests/gpio/.gitignore
+++ b/tools/testing/selftests/gpio/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 gpio-mockup-cdev
+gpio-chip-info
diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
index 39f2bbe8dd3d..84b48547f94c 100644
--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -2,6 +2,6 @@
 
 TEST_PROGS := gpio-mockup.sh
 TEST_FILES := gpio-mockup-sysfs.sh
-TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev
+TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info
 
 include ../lib.mk
diff --git a/tools/testing/selftests/gpio/gpio-chip-info.c b/tools/testing/selftests/gpio/gpio-chip-info.c
new file mode 100644
index 000000000000..fdc07e742fba
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-chip-info.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * GPIO character device helper for reading chip information.
+ *
+ * Copyright (C) 2021 Bartosz Golaszewski <brgl@bgdev.pl>
+ */
+
+#include <fcntl.h>
+#include <linux/gpio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+
+static void print_usage(void)
+{
+	printf("usage:\n");
+	printf("  gpio-chip-info <chip path> [name|label|num-lines]\n");
+}
+
+int main(int argc, char **argv)
+{
+	struct gpiochip_info info;
+	int fd, ret;
+
+	if (argc != 3) {
+		print_usage();
+		return EXIT_FAILURE;
+	}
+
+	fd = open(argv[1], O_RDWR);
+	if (fd < 0) {
+		perror("unable to open the GPIO chip");
+		return EXIT_FAILURE;
+	}
+
+	memset(&info, 0, sizeof(info));
+	ret = ioctl(fd, GPIO_GET_CHIPINFO_IOCTL, &info);
+	if (ret) {
+		perror("chip info ioctl failed");
+		return EXIT_FAILURE;
+	}
+
+	if (strcmp(argv[2], "name") == 0) {
+		printf("%s\n", info.name);
+	} else if (strcmp(argv[2], "label") == 0) {
+		printf("%s\n", info.label);
+	} else if (strcmp(argv[2], "num-lines") == 0) {
+		printf("%u\n", info.lines);
+	} else {
+		fprintf(stderr, "unknown command: %s\n", argv[2]);
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
-- 
cgit v1.2.3


From b2bb90c80a3e9ba6d479aaf130277d6f96a87161 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 13 Dec 2021 13:42:20 +0100
Subject: selftests: gpio: add a helper for reading GPIO line names

Add a simple program that allows to read GPIO line names from the
character device. This will be used in gpio-sim selftests.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 tools/testing/selftests/gpio/.gitignore       |  1 +
 tools/testing/selftests/gpio/Makefile         |  2 +-
 tools/testing/selftests/gpio/gpio-line-name.c | 55 +++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/gpio/gpio-line-name.c

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/.gitignore b/tools/testing/selftests/gpio/.gitignore
index 4ea4f58dab1a..ededb077a3a6 100644
--- a/tools/testing/selftests/gpio/.gitignore
+++ b/tools/testing/selftests/gpio/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 gpio-mockup-cdev
 gpio-chip-info
+gpio-line-name
diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
index 84b48547f94c..d7d8f1985d99 100644
--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -2,6 +2,6 @@
 
 TEST_PROGS := gpio-mockup.sh
 TEST_FILES := gpio-mockup-sysfs.sh
-TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info
+TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name
 
 include ../lib.mk
diff --git a/tools/testing/selftests/gpio/gpio-line-name.c b/tools/testing/selftests/gpio/gpio-line-name.c
new file mode 100644
index 000000000000..e635cfadbded
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-line-name.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * GPIO character device helper for reading line names.
+ *
+ * Copyright (C) 2021 Bartosz Golaszewski <brgl@bgdev.pl>
+ */
+
+#include <fcntl.h>
+#include <linux/gpio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+
+static void print_usage(void)
+{
+	printf("usage:\n");
+	printf("  gpio-line-name <chip path> <line offset>\n");
+}
+
+int main(int argc, char **argv)
+{
+	struct gpio_v2_line_info info;
+	int fd, ret;
+	char *endp;
+
+	if (argc != 3) {
+		print_usage();
+		return EXIT_FAILURE;
+	}
+
+	fd = open(argv[1], O_RDWR);
+	if (fd < 0) {
+		perror("unable to open the GPIO chip");
+		return EXIT_FAILURE;
+	}
+
+	memset(&info, 0, sizeof(info));
+	info.offset = strtoul(argv[2], &endp, 10);
+	if (*endp != '\0') {
+		print_usage();
+		return EXIT_FAILURE;
+	}
+
+	ret = ioctl(fd, GPIO_V2_GET_LINEINFO_IOCTL, &info);
+	if (ret) {
+		perror("line info ioctl failed");
+		return EXIT_FAILURE;
+	}
+
+	printf("%s\n", info.name);
+
+	return EXIT_SUCCESS;
+}
-- 
cgit v1.2.3


From 1d96b8f635d98748e251e8e9625cf42d81514425 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 13 Dec 2021 13:43:14 +0100
Subject: selftests: gpio: add test cases for gpio-sim

Add a set of tests for the new gpio-sim module. This is a pure shell
test-suite and uses the helper programs available in the gpio selftests
directory. These test-cases only test the functionalities exposed by the
gpio-sim driver, not those handled by core gpiolib code.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 tools/testing/selftests/gpio/Makefile    |   2 +-
 tools/testing/selftests/gpio/config      |   1 +
 tools/testing/selftests/gpio/gpio-sim.sh | 396 +++++++++++++++++++++++++++++++
 3 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/gpio/gpio-sim.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
index d7d8f1985d99..4c6df61c76a8 100644
--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-TEST_PROGS := gpio-mockup.sh
+TEST_PROGS := gpio-mockup.sh gpio-sim.sh
 TEST_FILES := gpio-mockup-sysfs.sh
 TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name
 
diff --git a/tools/testing/selftests/gpio/config b/tools/testing/selftests/gpio/config
index ce100342c20b..409a8532facc 100644
--- a/tools/testing/selftests/gpio/config
+++ b/tools/testing/selftests/gpio/config
@@ -1,3 +1,4 @@
 CONFIG_GPIOLIB=y
 CONFIG_GPIO_CDEV=y
 CONFIG_GPIO_MOCKUP=m
+CONFIG_GPIO_SIM=m
diff --git a/tools/testing/selftests/gpio/gpio-sim.sh b/tools/testing/selftests/gpio/gpio-sim.sh
new file mode 100755
index 000000000000..d335a975890c
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-sim.sh
@@ -0,0 +1,396 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 Bartosz Golaszewski <brgl@bgdev.pl>
+
+BASE_DIR=`dirname $0`
+CONFIGFS_DIR="/sys/kernel/config/gpio-sim"
+MODULE="gpio-sim"
+
+fail() {
+	echo "$*" >&2
+	echo "GPIO $MODULE test FAIL"
+	exit 1
+}
+
+skip() {
+	echo "$*" >&2
+	echo "GPIO $MODULE test SKIP"
+	exit 4
+}
+
+remove_chip() {
+	local CHIP=$1
+
+	for FILE in $CONFIGFS_DIR/$CHIP/*; do
+		BANK=`basename $FILE`
+		if [ "$BANK" == "live" ] || [ "$BANK" == "dev_name" ]; then
+			continue
+		fi
+
+		LINES=`ls $CONFIGFS_DIR/$CHIP/$BANK/ | egrep ^line`
+		if [ "$?" == 0 ]; then
+			for LINE in $LINES; do
+				if [ -e $CONFIGFS_DIR/$CHIP/$BANK/$LINE/hog ]; then
+					rmdir $CONFIGFS_DIR/$CHIP/$BANK/$LINE/hog || \
+						fail "Unable to remove the hog"
+				fi
+
+				rmdir $CONFIGFS_DIR/$CHIP/$BANK/$LINE || \
+					fail "Unable to remove the line"
+			done
+		fi
+
+		rmdir $CONFIGFS_DIR/$CHIP/$BANK
+	done
+
+	rmdir $CONFIGFS_DIR/$CHIP || fail "Unable to remove the chip"
+}
+
+configfs_cleanup() {
+	for CHIP in `ls $CONFIGFS_DIR/`; do
+		remove_chip $CHIP
+	done
+}
+
+create_chip() {
+	local CHIP=$1
+
+	mkdir $CONFIGFS_DIR/$CHIP
+}
+
+create_bank() {
+	local CHIP=$1
+	local BANK=$2
+
+	mkdir $CONFIGFS_DIR/$CHIP/$BANK
+}
+
+set_label() {
+	local CHIP=$1
+	local BANK=$2
+	local LABEL=$3
+
+	echo $LABEL > $CONFIGFS_DIR/$CHIP/$BANK/label || fail "Unable to set the chip label"
+}
+
+set_num_lines() {
+	local CHIP=$1
+	local BANK=$2
+	local NUM_LINES=$3
+
+	echo $NUM_LINES > $CONFIGFS_DIR/$CHIP/$BANK/num_lines || \
+		fail "Unable to set the number of lines"
+}
+
+set_line_name() {
+	local CHIP=$1
+	local BANK=$2
+	local OFFSET=$3
+	local NAME=$4
+	local LINE_DIR=$CONFIGFS_DIR/$CHIP/$BANK/line$OFFSET
+
+	test -d $LINE_DIR || mkdir $LINE_DIR
+	echo $NAME > $LINE_DIR/name || fail "Unable to set the line name"
+}
+
+enable_chip() {
+	local CHIP=$1
+
+	echo 1 > $CONFIGFS_DIR/$CHIP/live || fail "Unable to enable the chip"
+}
+
+disable_chip() {
+	local CHIP=$1
+
+	echo 0 > $CONFIGFS_DIR/$CHIP/live || fail "Unable to disable the chip"
+}
+
+configfs_chip_name() {
+	local CHIP=$1
+	local BANK=$2
+
+	cat $CONFIGFS_DIR/$CHIP/$BANK/chip_name 2> /dev/null || \
+		fail "unable to read the chip name from configfs"
+}
+
+configfs_dev_name() {
+	local CHIP=$1
+
+	cat $CONFIGFS_DIR/$CHIP/dev_name 2> /dev/null || \
+		fail "unable to read the device name from configfs"
+}
+
+get_chip_num_lines() {
+	local CHIP=$1
+	local BANK=$2
+
+	$BASE_DIR/gpio-chip-info /dev/`configfs_chip_name $CHIP $BANK` num-lines || \
+		fail "unable to read the number of lines from the character device"
+}
+
+get_chip_label() {
+	local CHIP=$1
+	local BANK=$2
+
+	$BASE_DIR/gpio-chip-info /dev/`configfs_chip_name $CHIP $BANK` label || \
+		fail "unable to read the chip label from the character device"
+}
+
+get_line_name() {
+	local CHIP=$1
+	local BANK=$2
+	local OFFSET=$3
+
+	$BASE_DIR/gpio-line-name /dev/`configfs_chip_name $CHIP $BANK` $OFFSET || \
+		fail "unable to read the line name from the character device"
+}
+
+sysfs_set_pull() {
+	local DEV=$1
+	local BANK=$2
+	local OFFSET=$3
+	local PULL=$4
+	local DEVNAME=`configfs_dev_name $DEV`
+	local CHIPNAME=`configfs_chip_name $DEV $BANK`
+	local SYSFSPATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio$OFFSET/pull"
+
+	echo $PULL > $SYSFSPATH || fail "Unable to set line pull in sysfs"
+}
+
+# Load the gpio-sim module. This will pull in configfs if needed too.
+modprobe gpio-sim || skip "unable to load the gpio-sim module"
+# Make sure configfs is mounted at /sys/kernel/config. Wait a bit if needed.
+for IDX in `seq 5`; do
+	if [ "$IDX" -eq "5" ]; then
+		skip "configfs not mounted at /sys/kernel/config"
+	fi
+
+	mountpoint -q /sys/kernel/config && break
+	sleep 0.1
+done
+# If the module was already loaded: remove all previous chips
+configfs_cleanup
+
+trap "exit 1" SIGTERM SIGINT
+trap configfs_cleanup EXIT
+
+echo "1. chip_name and dev_name attributes"
+
+echo "1.1. Chip name is communicated to user"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+test -n `cat $CONFIGFS_DIR/chip/bank/chip_name` || fail "chip_name doesn't work"
+remove_chip chip
+
+echo "1.2. chip_name returns 'none' if the chip is still pending"
+create_chip chip
+create_bank chip bank
+test "`cat $CONFIGFS_DIR/chip/bank/chip_name`" = "none" || \
+	fail "chip_name doesn't return 'none' for a pending chip"
+remove_chip chip
+
+echo "1.3. Device name is communicated to user"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+test -n `cat $CONFIGFS_DIR/chip/dev_name` || fail "dev_name doesn't work"
+remove_chip chip
+
+echo "2. Creating and configuring simulated chips"
+
+echo "2.1. Default number of lines is 1"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+test "`get_chip_num_lines chip bank`" = "1" || fail "default number of lines is not 1"
+remove_chip chip
+
+echo "2.2. Number of lines can be specified"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 16
+enable_chip chip
+test "`get_chip_num_lines chip bank`" = "16" || fail "number of lines is not 16"
+remove_chip chip
+
+echo "2.3. Label can be set"
+create_chip chip
+create_bank chip bank
+set_label chip bank foobar
+enable_chip chip
+test "`get_chip_label chip bank`" = "foobar" || fail "label is incorrect"
+remove_chip chip
+
+echo "2.4. Label can be left empty"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+test -z "`cat $CONFIGFS_DIR/chip/bank/label`" || fail "label is not empty"
+remove_chip chip
+
+echo "2.5. Line names can be configured"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 16
+set_line_name chip bank 0 foo
+set_line_name chip bank 2 bar
+enable_chip chip
+test "`get_line_name chip bank 0`" = "foo" || fail "line name is incorrect"
+test "`get_line_name chip bank 2`" = "bar" || fail "line name is incorrect"
+remove_chip chip
+
+echo "2.6. Line config can remain unused if offset is greater than number of lines"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 2
+set_line_name chip bank 5 foobar
+enable_chip chip
+test "`get_line_name chip bank 0`" = "" || fail "line name is incorrect"
+test "`get_line_name chip bank 1`" = "" || fail "line name is incorrect"
+remove_chip chip
+
+echo "2.7. Line configfs directory names are sanitized"
+create_chip chip
+create_bank chip bank
+mkdir $CONFIGFS_DIR/chip/bank/line12foobar 2> /dev/null && \
+	fail "invalid configfs line name accepted"
+mkdir $CONFIGFS_DIR/chip/bank/line_no_offset 2> /dev/null && \
+	fail "invalid configfs line name accepted"
+remove_chip chip
+
+echo "2.8. Multiple chips can be created"
+CHIPS="chip0 chip1 chip2"
+for CHIP in $CHIPS; do
+	create_chip $CHIP
+	create_bank $CHIP bank
+	enable_chip $CHIP
+done
+for CHIP in $CHIPS; do
+	remove_chip $CHIP
+done
+
+echo "2.9. Can't modify settings when chip is live"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+echo foobar > $CONFIGFS_DIR/chip/bank/label 2> /dev/null && \
+	fail "Setting label of a live chip should fail"
+echo 8 > $CONFIGFS_DIR/chip/bank/num_lines 2> /dev/null && \
+	fail "Setting number of lines of a live chip should fail"
+remove_chip chip
+
+echo "2.10. Can't create line items when chip is live"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+mkdir $CONFIGFS_DIR/chip/bank/line0 2> /dev/null && fail "Creating line item should fail"
+remove_chip chip
+
+echo "2.11. Probe errors are propagated to user-space"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 99999
+echo 1 > $CONFIGFS_DIR/chip/live 2> /dev/null && fail "Probe error was not propagated"
+remove_chip chip
+
+echo "2.12. Cannot enable a chip without any GPIO banks"
+create_chip chip
+echo 1 > $CONFIGFS_DIR/chip/live 2> /dev/null && fail "Chip enabled without any GPIO banks"
+remove_chip chip
+
+echo "2.13. Duplicate chip labels are not allowed"
+create_chip chip
+create_bank chip bank0
+set_label chip bank0 foobar
+create_bank chip bank1
+set_label chip bank1 foobar
+echo 1 > $CONFIGFS_DIR/chip/live 2> /dev/null && fail "Duplicate chip labels were not rejected"
+remove_chip chip
+
+echo "2.14. Lines can be hogged"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+mkdir -p $CONFIGFS_DIR/chip/bank/line4/hog
+enable_chip chip
+$BASE_DIR/gpio-mockup-cdev -s 1 /dev/`configfs_chip_name chip bank` 4 2> /dev/null && \
+	fail "Setting the value of a hogged line shouldn't succeed"
+remove_chip chip
+
+echo "3. Controlling simulated chips"
+
+echo "3.1. Pull can be set over sysfs"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+enable_chip chip
+sysfs_set_pull chip bank 0 pull-up
+$BASE_DIR/gpio-mockup-cdev /dev/`configfs_chip_name chip bank` 0
+test "$?" = "1" || fail "pull set incorrectly"
+sysfs_set_pull chip bank 0 pull-down
+$BASE_DIR/gpio-mockup-cdev /dev/`configfs_chip_name chip bank` 1
+test "$?" = "0" || fail "pull set incorrectly"
+remove_chip chip
+
+echo "3.2. Pull can be read from sysfs"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+enable_chip chip
+DEVNAME=`configfs_dev_name chip`
+CHIPNAME=`configfs_chip_name chip bank`
+SYSFS_PATH=/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/pull
+test `cat $SYSFS_PATH` = "pull-down" || fail "reading the pull failed"
+sysfs_set_pull chip bank 0 pull-up
+test `cat $SYSFS_PATH` = "pull-up" || fail "reading the pull failed"
+remove_chip chip
+
+echo "3.3. Incorrect input in sysfs is rejected"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+enable_chip chip
+DEVNAME=`configfs_dev_name chip`
+CHIPNAME=`configfs_chip_name chip bank`
+SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/pull"
+echo foobar > $SYSFS_PATH 2> /dev/null && fail "invalid input not detected"
+remove_chip chip
+
+echo "3.4. Can't write to value"
+create_chip chip
+create_bank chip bank
+enable_chip chip
+DEVNAME=`configfs_dev_name chip`
+CHIPNAME=`configfs_chip_name chip bank`
+SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/value"
+echo 1 > $SYSFS_PATH 2> /dev/null && fail "writing to 'value' succeeded unexpectedly"
+remove_chip chip
+
+echo "4. Simulated GPIO chips are functional"
+
+echo "4.1. Values can be read from sysfs"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+enable_chip chip
+DEVNAME=`configfs_dev_name chip`
+CHIPNAME=`configfs_chip_name chip bank`
+SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/value"
+test `cat $SYSFS_PATH` = "0" || fail "incorrect value read from sysfs"
+$BASE_DIR/gpio-mockup-cdev -s 1 /dev/`configfs_chip_name chip bank` 0 &
+sleep 0.1 # FIXME Any better way?
+test `cat $SYSFS_PATH` = "1" || fail "incorrect value read from sysfs"
+kill $!
+remove_chip chip
+
+echo "4.2. Bias settings work correctly"
+create_chip chip
+create_bank chip bank
+set_num_lines chip bank 8
+enable_chip chip
+$BASE_DIR/gpio-mockup-cdev -b pull-up /dev/`configfs_chip_name chip bank` 0
+test `cat $SYSFS_PATH` = "1" || fail "bias setting does not work"
+remove_chip chip
+
+echo "GPIO $MODULE test PASS"
-- 
cgit v1.2.3


From f7eda6fe03226b8a608a5d52a1e8273a14fba07c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Mon, 13 Dec 2021 17:23:55 +0100
Subject: selftests: gpio: gpio-sim: remove bashisms

'==' is a bashisms and not understood by POSIX shell. Drop it from
gpio-sim selftests.

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 tools/testing/selftests/gpio/gpio-sim.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/gpio-sim.sh b/tools/testing/selftests/gpio/gpio-sim.sh
index d335a975890c..c913d5aec768 100755
--- a/tools/testing/selftests/gpio/gpio-sim.sh
+++ b/tools/testing/selftests/gpio/gpio-sim.sh
@@ -23,12 +23,12 @@ remove_chip() {
 
 	for FILE in $CONFIGFS_DIR/$CHIP/*; do
 		BANK=`basename $FILE`
-		if [ "$BANK" == "live" ] || [ "$BANK" == "dev_name" ]; then
+		if [ "$BANK" = "live" ] || [ "$BANK" = "dev_name" ]; then
 			continue
 		fi
 
 		LINES=`ls $CONFIGFS_DIR/$CHIP/$BANK/ | egrep ^line`
-		if [ "$?" == 0 ]; then
+		if [ "$?" = 0 ]; then
 			for LINE in $LINES; do
 				if [ -e $CONFIGFS_DIR/$CHIP/$BANK/$LINE/hog ]; then
 					rmdir $CONFIGFS_DIR/$CHIP/$BANK/$LINE/hog || \
-- 
cgit v1.2.3


From 36ccddf80e56b8c51604bafd449522a5271bfd35 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Tue, 14 Dec 2021 09:06:51 +0100
Subject: selftests: gpio: gpio-sim: avoid forking test twice

Use '-o' within [] in order to avoid spawning two processes for test.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 tools/testing/selftests/gpio/gpio-sim.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/gpio-sim.sh b/tools/testing/selftests/gpio/gpio-sim.sh
index c913d5aec768..341e3de00896 100755
--- a/tools/testing/selftests/gpio/gpio-sim.sh
+++ b/tools/testing/selftests/gpio/gpio-sim.sh
@@ -23,7 +23,7 @@ remove_chip() {
 
 	for FILE in $CONFIGFS_DIR/$CHIP/*; do
 		BANK=`basename $FILE`
-		if [ "$BANK" = "live" ] || [ "$BANK" = "dev_name" ]; then
+		if [ "$BANK" = "live" -o "$BANK" = "dev_name" ]; then
 			continue
 		fi
 
-- 
cgit v1.2.3


From 572a0a647b9b491729d24c083c8410c55bf16326 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Sat, 4 Dec 2021 22:23:55 +0200
Subject: selftests/sgx: Fix corrupted cpuid macro invocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SGX selftest fails to build on tip/x86/sgx:

	main.c: In function ‘get_total_epc_mem’:
	main.c:296:17: error: implicit declaration of function ‘__cpuid’ [-Werror=implicit-function-declaration]
	  296 |                 __cpuid(&eax, &ebx, &ecx, &edx);
	      |                 ^~~~~~~

Include cpuid.h and use __cpuid_count() macro in order to fix the
compilation issue.

[ dhansen: tweak commit message ]

Fixes: f0ff2447b861 ("selftests/sgx: Add a new kselftest: Unclobbered_vdso_oversubscribed")
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/20211204202355.23005-1-jarkko@kernel.org
Cc: Shuah Khan <shuah@kernel.org>
---
 tools/testing/selftests/sgx/main.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c
index 7e912db4c6c5..370c4995f7c4 100644
--- a/tools/testing/selftests/sgx/main.c
+++ b/tools/testing/selftests/sgx/main.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <cpuid.h>
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -291,9 +292,7 @@ static unsigned long get_total_epc_mem(void)
 	int section = 0;
 
 	while (true) {
-		eax = SGX_CPUID;
-		ecx = section + SGX_CPUID_EPC;
-		__cpuid(&eax, &ebx, &ecx, &edx);
+		__cpuid_count(SGX_CPUID, section + SGX_CPUID_EPC, eax, ebx, ecx, edx);
 
 		type = eax & SGX_CPUID_EPC_MASK;
 		if (type == SGX_CPUID_EPC_INVALID)
-- 
cgit v1.2.3


From 878d8def0603eebf11e19903e7a8886b3e9728e4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Dec 2021 09:12:00 -0800
Subject: libbpf: Rework feature-probing APIs

Create three extensible alternatives to inconsistently named
feature-probing APIs:

  - libbpf_probe_bpf_prog_type() instead of bpf_probe_prog_type();
  - libbpf_probe_bpf_map_type() instead of bpf_probe_map_type();
  - libbpf_probe_bpf_helper() instead of bpf_probe_helper().

Set up return values such that libbpf can report errors (e.g., if some
combination of input arguments isn't possible to validate, etc), in
addition to whether the feature is supported (return value 1) or not
supported (return value 0).

Also schedule deprecation of those three APIs. Also schedule deprecation
of bpf_probe_large_insn_limit().

Also fix all the existing detection logic for various program and map
types that never worked:

  - BPF_PROG_TYPE_LIRC_MODE2;
  - BPF_PROG_TYPE_TRACING;
  - BPF_PROG_TYPE_LSM;
  - BPF_PROG_TYPE_EXT;
  - BPF_PROG_TYPE_SYSCALL;
  - BPF_PROG_TYPE_STRUCT_OPS;
  - BPF_MAP_TYPE_STRUCT_OPS;
  - BPF_MAP_TYPE_BLOOM_FILTER.

Above prog/map types needed special setups and detection logic to work.
Subsequent patch adds selftests that will make sure that all the
detection logic keeps working for all current and future program and map
types, avoiding otherwise inevitable bit rot.

  [0] Closes: https://github.com/libbpf/libbpf/issues/312

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Julia Kartseva <hex@fb.com>
Link: https://lore.kernel.org/bpf/20211217171202.3352835-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.h        |  52 +++++++++-
 tools/lib/bpf/libbpf.map      |   3 +
 tools/lib/bpf/libbpf_probes.c | 235 +++++++++++++++++++++++++++++++++---------
 3 files changed, 236 insertions(+), 54 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 42b2f36fd9f0..85dfef88b3d2 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1052,13 +1052,57 @@ bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo,
  * user, causing subsequent probes to fail. In this case, the caller may want
  * to adjust that limit with setrlimit().
  */
-LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type,
-				    __u32 ifindex);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_prog_type() instead")
+LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_map_type() instead")
 LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex);
-LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id,
-				 enum bpf_prog_type prog_type, __u32 ifindex);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_helper() instead")
+LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex);
+LIBBPF_DEPRECATED_SINCE(0, 8, "implement your own or use bpftool for feature detection")
 LIBBPF_API bool bpf_probe_large_insn_limit(__u32 ifindex);
 
+/**
+ * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports
+ * BPF programs of a given type.
+ * @param prog_type BPF program type to detect kernel support for
+ * @param opts reserved for future extensibility, should be NULL
+ * @return 1, if given program type is supported; 0, if given program type is
+ * not supported; negative error code if feature detection failed or can't be
+ * performed
+ *
+ * Make sure the process has required set of CAP_* permissions (or runs as
+ * root) when performing feature checking.
+ */
+LIBBPF_API int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts);
+/**
+ * @brief **libbpf_probe_bpf_map_type()** detects if host kernel supports
+ * BPF maps of a given type.
+ * @param map_type BPF map type to detect kernel support for
+ * @param opts reserved for future extensibility, should be NULL
+ * @return 1, if given map type is supported; 0, if given map type is
+ * not supported; negative error code if feature detection failed or can't be
+ * performed
+ *
+ * Make sure the process has required set of CAP_* permissions (or runs as
+ * root) when performing feature checking.
+ */
+LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts);
+/**
+ * @brief **libbpf_probe_bpf_helper()** detects if host kernel supports the
+ * use of a given BPF helper from specified BPF program type.
+ * @param prog_type BPF program type used to check the support of BPF helper
+ * @param helper_id BPF helper ID (enum bpf_func_id) to check support for
+ * @param opts reserved for future extensibility, should be NULL
+ * @return 1, if given combination of program type and helper is supported; 0,
+ * if the combination is not supported; negative error code if feature
+ * detection for provided input arguments failed or can't be performed
+ *
+ * Make sure the process has required set of CAP_* permissions (or runs as
+ * root) when performing feature checking.
+ */
+LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type,
+				       enum bpf_func_id helper_id, const void *opts);
+
 /*
  * Get bpf_prog_info in continuous memory
  *
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b3938b3f8fc9..529783967793 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -427,5 +427,8 @@ LIBBPF_0.7.0 {
 		bpf_program__log_level;
 		bpf_program__set_log_buf;
 		bpf_program__set_log_level;
+		libbpf_probe_bpf_helper;
+		libbpf_probe_bpf_map_type;
+		libbpf_probe_bpf_prog_type;
 		libbpf_set_memlock_rlim_max;
 };
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 4bdec69523a7..9d3985283dfc 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -64,12 +64,20 @@ static int get_kernel_version(void)
 	return (version << 16) + (subversion << 8) + patchlevel;
 }
 
-static void
-probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
-	   size_t insns_cnt, char *buf, size_t buf_len, __u32 ifindex)
+static int probe_prog_load(enum bpf_prog_type prog_type,
+			   const struct bpf_insn *insns, size_t insns_cnt,
+			   char *log_buf, size_t log_buf_sz,
+			   __u32 ifindex)
 {
-	LIBBPF_OPTS(bpf_prog_load_opts, opts);
-	int fd;
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.log_buf = log_buf,
+		.log_size = log_buf_sz,
+		.log_level = log_buf ? 1 : 0,
+		.prog_ifindex = ifindex,
+	);
+	int fd, err, exp_err = 0;
+	const char *exp_msg = NULL;
+	char buf[4096];
 
 	switch (prog_type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -84,6 +92,38 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_KPROBE:
 		opts.kern_version = get_kernel_version();
 		break;
+	case BPF_PROG_TYPE_LIRC_MODE2:
+		opts.expected_attach_type = BPF_LIRC_MODE2;
+		break;
+	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_LSM:
+		opts.log_buf = buf;
+		opts.log_size = sizeof(buf);
+		opts.log_level = 1;
+		if (prog_type == BPF_PROG_TYPE_TRACING)
+			opts.expected_attach_type = BPF_TRACE_FENTRY;
+		else
+			opts.expected_attach_type = BPF_MODIFY_RETURN;
+		opts.attach_btf_id = 1;
+
+		exp_err = -EINVAL;
+		exp_msg = "attach_btf_id 1 is not a function";
+		break;
+	case BPF_PROG_TYPE_EXT:
+		opts.log_buf = buf;
+		opts.log_size = sizeof(buf);
+		opts.log_level = 1;
+		opts.attach_btf_id = 1;
+
+		exp_err = -EINVAL;
+		exp_msg = "Cannot replace kernel functions";
+		break;
+	case BPF_PROG_TYPE_SYSCALL:
+		opts.prog_flags = BPF_F_SLEEPABLE;
+		break;
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		exp_err = -524; /* -ENOTSUPP */
+		break;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
@@ -103,25 +143,42 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
 	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
-	case BPF_PROG_TYPE_LIRC_MODE2:
 	case BPF_PROG_TYPE_SK_REUSEPORT:
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
-	case BPF_PROG_TYPE_TRACING:
-	case BPF_PROG_TYPE_STRUCT_OPS:
-	case BPF_PROG_TYPE_EXT:
-	case BPF_PROG_TYPE_LSM:
-	default:
 		break;
+	default:
+		return -EOPNOTSUPP;
 	}
 
-	opts.prog_ifindex = ifindex;
-	opts.log_buf = buf;
-	opts.log_size = buf_len;
-
-	fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, NULL);
+	fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts);
+	err = -errno;
 	if (fd >= 0)
 		close(fd);
+	if (exp_err) {
+		if (fd >= 0 || err != exp_err)
+			return 0;
+		if (exp_msg && !strstr(buf, exp_msg))
+			return 0;
+		return 1;
+	}
+	return fd >= 0 ? 1 : 0;
+}
+
+int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN()
+	};
+	const size_t insn_cnt = ARRAY_SIZE(insns);
+	int ret;
+
+	if (opts)
+		return libbpf_err(-EINVAL);
+
+	ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0, 0);
+	return libbpf_err(ret);
 }
 
 bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex)
@@ -131,12 +188,16 @@ bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex)
 		BPF_EXIT_INSN()
 	};
 
+	/* prefer libbpf_probe_bpf_prog_type() unless offload is requested */
+	if (ifindex == 0)
+		return libbpf_probe_bpf_prog_type(prog_type, NULL) == 1;
+
 	if (ifindex && prog_type == BPF_PROG_TYPE_SCHED_CLS)
 		/* nfp returns -EINVAL on exit(0) with TC offload */
 		insns[0].imm = 2;
 
 	errno = 0;
-	probe_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex);
+	probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex);
 
 	return errno != EINVAL && errno != EOPNOTSUPP;
 }
@@ -197,16 +258,18 @@ static int load_local_storage_btf(void)
 				     strs, sizeof(strs));
 }
 
-bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
+static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex)
 {
-	int key_size, value_size, max_entries, map_flags;
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
+	int key_size, value_size, max_entries;
 	__u32 btf_key_type_id = 0, btf_value_type_id = 0;
-	int fd = -1, btf_fd = -1, fd_inner;
+	int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err;
+
+	opts.map_ifindex = ifindex;
 
 	key_size	= sizeof(__u32);
 	value_size	= sizeof(__u32);
 	max_entries	= 1;
-	map_flags	= 0;
 
 	switch (map_type) {
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -215,7 +278,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 	case BPF_MAP_TYPE_LPM_TRIE:
 		key_size	= sizeof(__u64);
 		value_size	= sizeof(__u64);
-		map_flags	= BPF_F_NO_PREALLOC;
+		opts.map_flags	= BPF_F_NO_PREALLOC;
 		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
 	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
@@ -234,17 +297,25 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 		btf_value_type_id = 3;
 		value_size = 8;
 		max_entries = 0;
-		map_flags = BPF_F_NO_PREALLOC;
+		opts.map_flags = BPF_F_NO_PREALLOC;
 		btf_fd = load_local_storage_btf();
 		if (btf_fd < 0)
-			return false;
+			return btf_fd;
 		break;
 	case BPF_MAP_TYPE_RINGBUF:
 		key_size = 0;
 		value_size = 0;
 		max_entries = 4096;
 		break;
-	case BPF_MAP_TYPE_UNSPEC:
+	case BPF_MAP_TYPE_STRUCT_OPS:
+		/* we'll get -ENOTSUPP for invalid BTF type ID for struct_ops */
+		opts.btf_vmlinux_value_type_id = 1;
+		exp_err = -524; /* -ENOTSUPP */
+		break;
+	case BPF_MAP_TYPE_BLOOM_FILTER:
+		key_size = 0;
+		max_entries = 1;
+		break;
 	case BPF_MAP_TYPE_HASH:
 	case BPF_MAP_TYPE_ARRAY:
 	case BPF_MAP_TYPE_PROG_ARRAY:
@@ -263,49 +334,114 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 	case BPF_MAP_TYPE_XSKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
-	case BPF_MAP_TYPE_STRUCT_OPS:
-	default:
 		break;
+	case BPF_MAP_TYPE_UNSPEC:
+	default:
+		return -EOPNOTSUPP;
 	}
 
 	if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
 	    map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		LIBBPF_OPTS(bpf_map_create_opts, opts);
-
 		/* TODO: probe for device, once libbpf has a function to create
 		 * map-in-map for offload
 		 */
 		if (ifindex)
-			return false;
+			goto cleanup;
 
 		fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
 					  sizeof(__u32), sizeof(__u32), 1, NULL);
 		if (fd_inner < 0)
-			return false;
+			goto cleanup;
 
 		opts.inner_map_fd = fd_inner;
-		fd = bpf_map_create(map_type, NULL, sizeof(__u32), sizeof(__u32), 1, &opts);
-		close(fd_inner);
-	} else {
-		LIBBPF_OPTS(bpf_map_create_opts, opts);
-
-		/* Note: No other restriction on map type probes for offload */
-		opts.map_flags = map_flags;
-		opts.map_ifindex = ifindex;
-		if (btf_fd >= 0) {
-			opts.btf_fd = btf_fd;
-			opts.btf_key_type_id = btf_key_type_id;
-			opts.btf_value_type_id = btf_value_type_id;
-		}
+	}
 
-		fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts);
+	if (btf_fd >= 0) {
+		opts.btf_fd = btf_fd;
+		opts.btf_key_type_id = btf_key_type_id;
+		opts.btf_value_type_id = btf_value_type_id;
 	}
+
+	fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts);
+	err = -errno;
+
+cleanup:
 	if (fd >= 0)
 		close(fd);
+	if (fd_inner >= 0)
+		close(fd_inner);
 	if (btf_fd >= 0)
 		close(btf_fd);
 
-	return fd >= 0;
+	if (exp_err)
+		return fd < 0 && err == exp_err ? 1 : 0;
+	else
+		return fd >= 0 ? 1 : 0;
+}
+
+int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts)
+{
+	int ret;
+
+	if (opts)
+		return libbpf_err(-EINVAL);
+
+	ret = probe_map_create(map_type, 0);
+	return libbpf_err(ret);
+}
+
+bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
+{
+	return probe_map_create(map_type, ifindex) == 1;
+}
+
+int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id,
+			    const void *opts)
+{
+	struct bpf_insn insns[] = {
+		BPF_EMIT_CALL((__u32)helper_id),
+		BPF_EXIT_INSN(),
+	};
+	const size_t insn_cnt = ARRAY_SIZE(insns);
+	char buf[4096];
+	int ret;
+
+	if (opts)
+		return libbpf_err(-EINVAL);
+
+	/* we can't successfully load all prog types to check for BPF helper
+	 * support, so bail out with -EOPNOTSUPP error
+	 */
+	switch (prog_type) {
+	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		return -EOPNOTSUPP;
+	default:
+		break;
+	}
+
+	buf[0] = '\0';
+	ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf), 0);
+	if (ret < 0)
+		return libbpf_err(ret);
+
+	/* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id)
+	 * at all, it will emit something like "invalid func unknown#181".
+	 * If BPF verifier recognizes BPF helper but it's not supported for
+	 * given BPF program type, it will emit "unknown func bpf_sys_bpf#166".
+	 * In both cases, provided combination of BPF program type and BPF
+	 * helper is not supported by the kernel.
+	 * In all other cases, probe_prog_load() above will either succeed (e.g.,
+	 * because BPF helper happens to accept no input arguments or it
+	 * accepts one input argument and initial PTR_TO_CTX is fine for
+	 * that), or we'll get some more specific BPF verifier error about
+	 * some unsatisfied conditions.
+	 */
+	if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ")))
+		return 0;
+	return 1; /* assume supported */
 }
 
 bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type,
@@ -318,8 +454,7 @@ bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type,
 	char buf[4096] = {};
 	bool res;
 
-	probe_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf),
-		   ifindex);
+	probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex);
 	res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ");
 
 	if (ifindex) {
@@ -351,8 +486,8 @@ bool bpf_probe_large_insn_limit(__u32 ifindex)
 	insns[BPF_MAXINSNS] = BPF_EXIT_INSN();
 
 	errno = 0;
-	probe_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0,
-		   ifindex);
+	probe_prog_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0,
+			ifindex);
 
 	return errno != E2BIG && errno != EINVAL;
 }
-- 
cgit v1.2.3


From 5a8ea82f9d25e88e502d1c3a1a9ba639f69a63c0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Dec 2021 09:12:01 -0800
Subject: selftests/bpf: Add libbpf feature-probing API selftests

Add selftests for prog/map/prog+helper feature probing APIs. Prog and
map selftests are designed in such a way that they will always test all
the possible prog/map types, based on running kernel's vmlinux BTF enum
definition. This way we'll always be sure that when adding new BPF
program types or map types, libbpf will be always updated accordingly to
be able to feature-detect them.

BPF prog_helper selftest will have to be manually extended with
interesting and important prog+helper combinations, it's easy, but can't
be completely automated.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211217171202.3352835-3-andrii@kernel.org
---
 tools/testing/selftests/bpf/config                 |   2 +
 .../selftests/bpf/prog_tests/libbpf_probes.c       | 124 +++++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/libbpf_probes.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 5192305159ec..f6287132fa89 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -38,7 +38,9 @@ CONFIG_IPV6_SIT=m
 CONFIG_BPF_JIT=y
 CONFIG_BPF_LSM=y
 CONFIG_SECURITY=y
+CONFIG_RC_CORE=y
 CONFIG_LIRC=y
+CONFIG_BPF_LIRC_MODE2=y
 CONFIG_IMA=y
 CONFIG_SECURITYFS=y
 CONFIG_IMA_WRITE_POLICY=y
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
new file mode 100644
index 000000000000..9f766ddd946a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 Facebook */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+void test_libbpf_probe_prog_types(void)
+{
+	struct btf *btf;
+	const struct btf_type *t;
+	const struct btf_enum *e;
+	int i, n, id;
+
+	btf = btf__parse("/sys/kernel/btf/vmlinux", NULL);
+	if (!ASSERT_OK_PTR(btf, "btf_parse"))
+		return;
+
+	/* find enum bpf_prog_type and enumerate each value */
+	id = btf__find_by_name_kind(btf, "bpf_prog_type", BTF_KIND_ENUM);
+	if (!ASSERT_GT(id, 0, "bpf_prog_type_id"))
+		goto cleanup;
+	t = btf__type_by_id(btf, id);
+	if (!ASSERT_OK_PTR(t, "bpf_prog_type_enum"))
+		goto cleanup;
+
+	for (e = btf_enum(t), i = 0, n = btf_vlen(t); i < n; e++, i++) {
+		const char *prog_type_name = btf__str_by_offset(btf, e->name_off);
+		enum bpf_prog_type prog_type = (enum bpf_prog_type)e->val;
+		int res;
+
+		if (prog_type == BPF_PROG_TYPE_UNSPEC)
+			continue;
+
+		if (!test__start_subtest(prog_type_name))
+			continue;
+
+		res = libbpf_probe_bpf_prog_type(prog_type, NULL);
+		ASSERT_EQ(res, 1, prog_type_name);
+	}
+
+cleanup:
+	btf__free(btf);
+}
+
+void test_libbpf_probe_map_types(void)
+{
+	struct btf *btf;
+	const struct btf_type *t;
+	const struct btf_enum *e;
+	int i, n, id;
+
+	btf = btf__parse("/sys/kernel/btf/vmlinux", NULL);
+	if (!ASSERT_OK_PTR(btf, "btf_parse"))
+		return;
+
+	/* find enum bpf_map_type and enumerate each value */
+	id = btf__find_by_name_kind(btf, "bpf_map_type", BTF_KIND_ENUM);
+	if (!ASSERT_GT(id, 0, "bpf_map_type_id"))
+		goto cleanup;
+	t = btf__type_by_id(btf, id);
+	if (!ASSERT_OK_PTR(t, "bpf_map_type_enum"))
+		goto cleanup;
+
+	for (e = btf_enum(t), i = 0, n = btf_vlen(t); i < n; e++, i++) {
+		const char *map_type_name = btf__str_by_offset(btf, e->name_off);
+		enum bpf_map_type map_type = (enum bpf_map_type)e->val;
+		int res;
+
+		if (map_type == BPF_MAP_TYPE_UNSPEC)
+			continue;
+
+		if (!test__start_subtest(map_type_name))
+			continue;
+
+		res = libbpf_probe_bpf_map_type(map_type, NULL);
+		ASSERT_EQ(res, 1, map_type_name);
+	}
+
+cleanup:
+	btf__free(btf);
+}
+
+void test_libbpf_probe_helpers(void)
+{
+#define CASE(prog, helper, supp) {			\
+	.prog_type_name = "BPF_PROG_TYPE_" # prog,	\
+	.helper_name = "bpf_" # helper,			\
+	.prog_type = BPF_PROG_TYPE_ ## prog,		\
+	.helper_id = BPF_FUNC_ ## helper,		\
+	.supported = supp,				\
+}
+	const struct case_def {
+		const char *prog_type_name;
+		const char *helper_name;
+		enum bpf_prog_type prog_type;
+		enum bpf_func_id helper_id;
+		bool supported;
+	} cases[] = {
+		CASE(KPROBE, unspec, false),
+		CASE(KPROBE, map_lookup_elem, true),
+		CASE(KPROBE, loop, true),
+
+		CASE(KPROBE, ktime_get_coarse_ns, false),
+		CASE(SOCKET_FILTER, ktime_get_coarse_ns, true),
+
+		CASE(KPROBE, sys_bpf, false),
+		CASE(SYSCALL, sys_bpf, true),
+	};
+	size_t case_cnt = ARRAY_SIZE(cases), i;
+	char buf[128];
+
+	for (i = 0; i < case_cnt; i++) {
+		const struct case_def *d = &cases[i];
+		int res;
+
+		snprintf(buf, sizeof(buf), "%s+%s", d->prog_type_name, d->helper_name);
+
+		if (!test__start_subtest(buf))
+			continue;
+
+		res = libbpf_probe_bpf_helper(d->prog_type, d->helper_id, NULL);
+		ASSERT_EQ(res, d->supported, buf);
+	}
+}
-- 
cgit v1.2.3


From e967a20a8fabc6442a78e2e2059e63a4bb6aed08 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Dec 2021 09:12:02 -0800
Subject: bpftool: Reimplement large insn size limit feature probing

Reimplement bpf_probe_large_insn_limit() in bpftool, as that libbpf API
is scheduled for deprecation in v0.8.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20211217171202.3352835-4-andrii@kernel.org
---
 tools/bpf/bpftool/feature.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 5397077d0d9e..6719b9282eca 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -642,12 +642,32 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
 		printf("\n");
 }
 
-static void
-probe_large_insn_limit(const char *define_prefix, __u32 ifindex)
+/*
+ * Probe for availability of kernel commit (5.3):
+ *
+ * c04c0d2b968a ("bpf: increase complexity limit and maximum program size")
+ */
+static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex)
 {
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.prog_ifindex = ifindex,
+	);
+	struct bpf_insn insns[BPF_MAXINSNS + 1];
 	bool res;
+	int i, fd;
+
+	for (i = 0; i < BPF_MAXINSNS; i++)
+		insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1);
+	insns[BPF_MAXINSNS] = BPF_EXIT_INSN();
+
+	errno = 0;
+	fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, NULL, "GPL",
+			   insns, ARRAY_SIZE(insns), &opts);
+	res = fd >= 0 || (errno != E2BIG && errno != EINVAL);
+
+	if (fd >= 0)
+		close(fd);
 
-	res = bpf_probe_large_insn_limit(ifindex);
 	print_bool_feature("have_large_insn_limit",
 			   "Large program size limit",
 			   "LARGE_INSN_LIMIT",
-- 
cgit v1.2.3


From 819d11507f6637731947836e6308f5966d64cf9d Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Fri, 17 Dec 2021 18:24:00 +0000
Subject: bpf, selftests: Fix spelling mistake "tained" -> "tainted"

There appears to be a spelling mistake in a bpf test message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20211217182400.39296-1-colin.i.king@gmail.com
---
 tools/testing/selftests/bpf/verifier/value_ptr_arith.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
index 4d347bc53aa2..359f3e8f8b60 100644
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -1078,7 +1078,7 @@
 	.errstr_unpriv = "R0 pointer -= pointer prohibited",
 },
 {
-	"map access: trying to leak tained dst reg",
+	"map access: trying to leak tainted dst reg",
 	.insns = {
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
-- 
cgit v1.2.3


From ff8752d7617da301ad3b7ef18caa58d135ee8c3c Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Thu, 16 Dec 2021 15:24:04 +0000
Subject: perf arm-spe: Synthesize SPE instruction events

Synthesize instruction events for every ARM SPE record.

Arm SPE implements a hardware-based sample period, and perf implements a
software-based one. Add a warning message to inform the user of this.

Signed-off-by: German Gomez <german.gomez@arm.com>
Tested-by: Leo Yan <leo.yan@linaro.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211216152404.52474-1-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/arm-spe.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8a3828f86901..d2b64e3f588b 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -58,6 +58,8 @@ struct arm_spe {
 	u8				sample_branch;
 	u8				sample_remote_access;
 	u8				sample_memory;
+	u8				sample_instructions;
+	u64				instructions_sample_period;
 
 	u64				l1d_miss_id;
 	u64				l1d_access_id;
@@ -68,6 +70,7 @@ struct arm_spe {
 	u64				branch_miss_id;
 	u64				remote_access_id;
 	u64				memory_id;
+	u64				instructions_id;
 
 	u64				kernel_start;
 
@@ -90,6 +93,7 @@ struct arm_spe_queue {
 	u64				time;
 	u64				timestamp;
 	struct thread			*thread;
+	u64				period_instructions;
 };
 
 static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
@@ -202,6 +206,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
 	speq->pid = -1;
 	speq->tid = -1;
 	speq->cpu = -1;
+	speq->period_instructions = 0;
 
 	/* params set */
 	params.get_trace = arm_spe_get_trace;
@@ -353,6 +358,35 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
 	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
+static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
+					     u64 spe_events_id, u64 data_src)
+{
+	struct arm_spe *spe = speq->spe;
+	struct arm_spe_record *record = &speq->decoder->record;
+	union perf_event *event = speq->event_buf;
+	struct perf_sample sample = { .ip = 0, };
+
+	/*
+	 * Handles perf instruction sampling period.
+	 */
+	speq->period_instructions++;
+	if (speq->period_instructions < spe->instructions_sample_period)
+		return 0;
+	speq->period_instructions = 0;
+
+	arm_spe_prep_sample(spe, speq, event, &sample);
+
+	sample.id = spe_events_id;
+	sample.stream_id = spe_events_id;
+	sample.addr = record->virt_addr;
+	sample.phys_addr = record->phys_addr;
+	sample.data_src = data_src;
+	sample.period = spe->instructions_sample_period;
+	sample.weight = record->latency;
+
+	return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+}
+
 #define SPE_MEM_TYPE	(ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS | \
 			 ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS | \
 			 ARM_SPE_REMOTE_ACCESS)
@@ -482,6 +516,12 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
 			return err;
 	}
 
+	if (spe->sample_instructions) {
+		err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -1110,7 +1150,29 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
 			return err;
 		spe->memory_id = id;
 		arm_spe_set_event_name(evlist, id, "memory");
+		id += 1;
+	}
+
+	if (spe->synth_opts.instructions) {
+		if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
+			pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
+			goto synth_instructions_out;
+		}
+		if (spe->synth_opts.period > 1)
+			pr_warning("Arm SPE has a hardware-based sample period.\n"
+				   "Additional instruction events will be discarded by --itrace\n");
+
+		spe->sample_instructions = true;
+		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
+		attr.sample_period = spe->synth_opts.period;
+		spe->instructions_sample_period = attr.sample_period;
+		err = arm_spe_synth_event(session, &attr, id);
+		if (err)
+			return err;
+		spe->instructions_id = id;
+		arm_spe_set_event_name(evlist, id, "instructions");
 	}
+synth_instructions_out:
 
 	return 0;
 }
-- 
cgit v1.2.3


From f730b65c9d85c84d6e5fd5f6fee20e2d62ffe558 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 17 Dec 2021 15:37:01 -0800
Subject: selftests: mptcp: try to set mptcp ulp mode in different sk states

The kernel will crash without
'mptcp: clear 'kern' flag from fallback sockets' change.

Since this doesn't slow down testing in a noticeable way,
run this unconditionally.

The explicit test did not catch this, because the check was done
for tcp socket returned by 'socket(.. IPPROTO_TCP) rather than a
tcp socket returned by accept() on a mptcp listen fd.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c  | 97 ++++++++++++----------
 tools/testing/selftests/net/mptcp/mptcp_connect.sh | 20 -----
 2 files changed, 51 insertions(+), 66 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 98de28ac3ba8..a30e93c5c549 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -59,7 +59,6 @@ static enum cfg_peek cfg_peek = CFG_NONE_PEEK;
 static const char *cfg_host;
 static const char *cfg_port	= "12000";
 static int cfg_sock_proto	= IPPROTO_MPTCP;
-static bool tcpulp_audit;
 static int pf = AF_INET;
 static int cfg_sndbuf;
 static int cfg_rcvbuf;
@@ -103,7 +102,6 @@ static void die_usage(void)
 	fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
 	fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
 	fprintf(stderr, "\t-M mark -- set socket packet mark\n");
-	fprintf(stderr, "\t-u -- check mptcp ulp\n");
 	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
 	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
 	fprintf(stderr, "\t-o option -- test sockopt <option>\n");
@@ -215,6 +213,42 @@ static void set_transparent(int fd, int pf)
 	}
 }
 
+static int do_ulp_so(int sock, const char *name)
+{
+	return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name));
+}
+
+#define X(m)	xerror("%s:%u: %s: failed for proto %d at line %u", __FILE__, __LINE__, (m), proto, line)
+static void sock_test_tcpulp(int sock, int proto, unsigned int line)
+{
+	socklen_t buflen = 8;
+	char buf[8] = "";
+	int ret = getsockopt(sock, IPPROTO_TCP, TCP_ULP, buf, &buflen);
+
+	if (ret != 0)
+		X("getsockopt");
+
+	if (buflen > 0) {
+		if (strcmp(buf, "mptcp") != 0)
+			xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line);
+		ret = do_ulp_so(sock, "tls");
+		if (ret == 0)
+			X("setsockopt");
+	} else if (proto == IPPROTO_MPTCP) {
+		ret = do_ulp_so(sock, "tls");
+		if (ret != -1)
+			X("setsockopt");
+	}
+
+	ret = do_ulp_so(sock, "mptcp");
+	if (ret != -1)
+		X("setsockopt");
+
+#undef X
+}
+
+#define SOCK_TEST_TCPULP(s, p) sock_test_tcpulp((s), (p), __LINE__)
+
 static int sock_listen_mptcp(const char * const listenaddr,
 			     const char * const port)
 {
@@ -238,6 +272,8 @@ static int sock_listen_mptcp(const char * const listenaddr,
 		if (sock < 0)
 			continue;
 
+		SOCK_TEST_TCPULP(sock, cfg_sock_proto);
+
 		if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
 				     sizeof(one)))
 			perror("setsockopt");
@@ -260,50 +296,17 @@ static int sock_listen_mptcp(const char * const listenaddr,
 		return sock;
 	}
 
+	SOCK_TEST_TCPULP(sock, cfg_sock_proto);
+
 	if (listen(sock, 20)) {
 		perror("listen");
 		close(sock);
 		return -1;
 	}
 
-	return sock;
-}
+	SOCK_TEST_TCPULP(sock, cfg_sock_proto);
 
-static bool sock_test_tcpulp(const char * const remoteaddr,
-			     const char * const port)
-{
-	struct addrinfo hints = {
-		.ai_protocol = IPPROTO_TCP,
-		.ai_socktype = SOCK_STREAM,
-	};
-	struct addrinfo *a, *addr;
-	int sock = -1, ret = 0;
-	bool test_pass = false;
-
-	hints.ai_family = AF_INET;
-
-	xgetaddrinfo(remoteaddr, port, &hints, &addr);
-	for (a = addr; a; a = a->ai_next) {
-		sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP);
-		if (sock < 0) {
-			perror("socket");
-			continue;
-		}
-		ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp",
-				 sizeof("mptcp"));
-		if (ret == -1 && errno == EOPNOTSUPP)
-			test_pass = true;
-		close(sock);
-
-		if (test_pass)
-			break;
-		if (!ret)
-			fprintf(stderr,
-				"setsockopt(TCP_ULP) returned 0\n");
-		else
-			perror("setsockopt(TCP_ULP)");
-	}
-	return test_pass;
+	return sock;
 }
 
 static int sock_connect_mptcp(const char * const remoteaddr,
@@ -326,6 +329,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 			continue;
 		}
 
+		SOCK_TEST_TCPULP(sock, proto);
+
 		if (cfg_mark)
 			set_mark(sock, cfg_mark);
 
@@ -338,6 +343,8 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 	}
 
 	freeaddrinfo(addr);
+	if (sock != -1)
+		SOCK_TEST_TCPULP(sock, proto);
 	return sock;
 }
 
@@ -954,6 +961,8 @@ int main_loop_s(int listensock)
 		check_sockaddr(pf, &ss, salen);
 		check_getpeername(remotesock, &ss, salen);
 
+		SOCK_TEST_TCPULP(remotesock, 0);
+
 		return copyfd_io(0, remotesock, 1);
 	}
 
@@ -1059,6 +1068,8 @@ int main_loop(void)
 
 	check_getpeername_connect(fd);
 
+	SOCK_TEST_TCPULP(fd, cfg_sock_proto);
+
 	if (cfg_rcvbuf)
 		set_rcvbuf(fd, cfg_rcvbuf);
 	if (cfg_sndbuf)
@@ -1151,7 +1162,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:o:")) != -1) {
+	while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) {
 		switch (c) {
 		case 'j':
 			cfg_join = true;
@@ -1177,9 +1188,6 @@ static void parse_opts(int argc, char **argv)
 		case 'h':
 			die_usage();
 			break;
-		case 'u':
-			tcpulp_audit = true;
-			break;
 		case '6':
 			pf = AF_INET6;
 			break;
@@ -1233,9 +1241,6 @@ int main(int argc, char *argv[])
 	signal(SIGUSR1, handle_signal);
 	parse_opts(argc, argv);
 
-	if (tcpulp_audit)
-		return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1;
-
 	if (listen_mode) {
 		int fd = sock_listen_mptcp(cfg_host, cfg_port);
 
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index a4226b608c68..ee28f82a6c89 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -296,24 +296,6 @@ check_mptcp_disabled()
 	return 0
 }
 
-check_mptcp_ulp_setsockopt()
-{
-	local t retval
-	t="ns_ulp-$sech-$(mktemp -u XXXXXX)"
-
-	ip netns add ${t} || exit $ksft_skip
-	if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
-		printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
-		retval=1
-		ret=$retval
-	else
-		printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
-		retval=0
-	fi
-	ip netns del ${t}
-	return $retval
-}
-
 # $1: IP address
 is_v6()
 {
@@ -780,8 +762,6 @@ make_file "$sin" "server"
 
 check_mptcp_disabled
 
-check_mptcp_ulp_setsockopt
-
 stop_if_error "The kernel configuration is not valid for MPTCP"
 
 echo "INFO: validating network environment with pings"
-- 
cgit v1.2.3


From 0a515a06c5ebfa46fee3ac519e418f801e718da4 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Sun, 12 Dec 2021 06:25:02 +0000
Subject: perf expr: Fix missing check for return value of hashmap__new()

The hashmap__new() function may return ERR_PTR(-ENOMEM) when malloc()
fails, add IS_ERR() checking for ctx->ids.

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20211212062504.25841-1-linmq006@gmail.com
[ s/kfree()/free()/ and add missing linux/err.h include ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/expr.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 1d532b9fed29..254601060b39 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -12,6 +12,7 @@
 #include "expr-bison.h"
 #include "expr-flex.h"
 #include "smt.h"
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <ctype.h>
@@ -299,6 +300,10 @@ struct expr_parse_ctx *expr__ctx_new(void)
 		return NULL;
 
 	ctx->ids = hashmap__new(key_hash, key_equal, NULL);
+	if (IS_ERR(ctx->ids)) {
+		free(ctx);
+		return NULL;
+	}
 	ctx->runtime = 0;
 
 	return ctx;
-- 
cgit v1.2.3


From 0c8e32fe48f549eef27c8c6b0a63530f83c3a643 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 13 Dec 2021 10:48:28 +0200
Subject: perf inject: Fix segfault due to close without open

The fixed commit attempts to close inject.output even if it was never
opened e.g.

  $ perf record uname
  Linux
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.002 MB perf.data (7 samples) ]
  $ perf inject -i perf.data --vm-time-correlation=dry-run
  Segmentation fault (core dumped)
  $ gdb --quiet perf
  Reading symbols from perf...
  (gdb) r inject -i perf.data --vm-time-correlation=dry-run
  Starting program: /home/ahunter/bin/perf inject -i perf.data --vm-time-correlation=dry-run
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".

  Program received signal SIGSEGV, Segmentation fault.
  0x00007eff8afeef5b in _IO_new_fclose (fp=0x0) at iofclose.c:48
  48      iofclose.c: No such file or directory.
  (gdb) bt
  #0  0x00007eff8afeef5b in _IO_new_fclose (fp=0x0) at iofclose.c:48
  #1  0x0000557fc7b74f92 in perf_data__close (data=data@entry=0x7ffcdafa6578) at util/data.c:376
  #2  0x0000557fc7a6b807 in cmd_inject (argc=<optimized out>, argv=<optimized out>) at builtin-inject.c:1085
  #3  0x0000557fc7ac4783 in run_builtin (p=0x557fc8074878 <commands+600>, argc=4, argv=0x7ffcdafb6a60) at perf.c:313
  #4  0x0000557fc7a25d5c in handle_internal_command (argv=<optimized out>, argc=<optimized out>) at perf.c:365
  #5  run_argv (argcp=<optimized out>, argv=<optimized out>) at perf.c:409
  #6  main (argc=4, argv=0x7ffcdafb6a60) at perf.c:539
  (gdb)

Fixes: 02e6246f5364d526 ("perf inject: Close inject.output on exit")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: stable@vger.kernel.org
Link: http://lore.kernel.org/lkml/20211213084829.114772-2-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index b9d6306cc14e..af70f1c72052 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -1078,7 +1078,8 @@ out_delete:
 	zstd_fini(&(inject.session->zstd_data));
 	perf_session__delete(inject.session);
 out_close_output:
-	perf_data__close(&inject.output);
+	if (!inject.in_place_update)
+		perf_data__close(&inject.output);
 	free(inject.itrace_synth_opts.vm_tm_corr_args);
 	return ret;
 }
-- 
cgit v1.2.3


From c271a55b0c6029fed0cac909fa57999a11467132 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 13 Dec 2021 10:48:29 +0200
Subject: perf inject: Fix segfault due to perf_data__fd() without open

The fixed commit attempts to get the output file descriptor even if the
file was never opened e.g.

  $ perf record uname
  Linux
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.002 MB perf.data (7 samples) ]
  $ perf inject -i perf.data --vm-time-correlation=dry-run
  Segmentation fault (core dumped)
  $ gdb --quiet perf
  Reading symbols from perf...
  (gdb) r inject -i perf.data --vm-time-correlation=dry-run
  Starting program: /home/ahunter/bin/perf inject -i perf.data --vm-time-correlation=dry-run
  [Thread debugging using libthread_db enabled]
  Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".

  Program received signal SIGSEGV, Segmentation fault.
  __GI___fileno (fp=0x0) at fileno.c:35
  35      fileno.c: No such file or directory.
  (gdb) bt
  #0  __GI___fileno (fp=0x0) at fileno.c:35
  #1  0x00005621e48dd987 in perf_data__fd (data=0x7fff4c68bd08) at util/data.h:72
  #2  perf_data__fd (data=0x7fff4c68bd08) at util/data.h:69
  #3  cmd_inject (argc=<optimized out>, argv=0x7fff4c69c1f0) at builtin-inject.c:1017
  #4  0x00005621e4936783 in run_builtin (p=0x5621e4ee6878 <commands+600>, argc=4, argv=0x7fff4c69c1f0) at perf.c:313
  #5  0x00005621e4897d5c in handle_internal_command (argv=<optimized out>, argc=<optimized out>) at perf.c:365
  #6  run_argv (argcp=<optimized out>, argv=<optimized out>) at perf.c:409
  #7  main (argc=4, argv=0x7fff4c69c1f0) at perf.c:539
  (gdb)

Fixes: 0ae03893623dd1dd ("perf tools: Pass a fd to perf_file_header__read_pipe()")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: stable@vger.kernel.org
Link: http://lore.kernel.org/lkml/20211213084829.114772-3-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index af70f1c72052..409b721666cb 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -755,12 +755,16 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str,
 	return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM;
 }
 
+static int output_fd(struct perf_inject *inject)
+{
+	return inject->in_place_update ? -1 : perf_data__fd(&inject->output);
+}
+
 static int __cmd_inject(struct perf_inject *inject)
 {
 	int ret = -EINVAL;
 	struct perf_session *session = inject->session;
-	struct perf_data *data_out = &inject->output;
-	int fd = inject->in_place_update ? -1 : perf_data__fd(data_out);
+	int fd = output_fd(inject);
 	u64 output_data_offset;
 
 	signal(SIGINT, sig_handler);
@@ -1015,7 +1019,7 @@ int cmd_inject(int argc, const char **argv)
 	}
 
 	inject.session = __perf_session__new(&data, repipe,
-					     perf_data__fd(&inject.output),
+					     output_fd(&inject),
 					     &inject.tool);
 	if (IS_ERR(inject.session)) {
 		ret = PTR_ERR(inject.session);
-- 
cgit v1.2.3


From 9497c458c10b049438ef6e6ddda898edbc3ec6a8 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 16 Dec 2021 16:31:52 -0800
Subject: bpf/selftests: Test PTR_TO_RDONLY_MEM

This test verifies that a ksym of non-struct can not be directly
updated.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211217003152.48334-10-haoluo@google.com
---
 tools/testing/selftests/bpf/prog_tests/ksyms_btf.c | 14 +++++++++++
 .../bpf/progs/test_ksyms_btf_write_check.c         | 29 ++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
index 79f6bd1e50d6..f6933b06daf8 100644
--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
@@ -8,6 +8,7 @@
 #include "test_ksyms_btf_null_check.skel.h"
 #include "test_ksyms_weak.skel.h"
 #include "test_ksyms_weak.lskel.h"
+#include "test_ksyms_btf_write_check.skel.h"
 
 static int duration;
 
@@ -137,6 +138,16 @@ cleanup:
 	test_ksyms_weak_lskel__destroy(skel);
 }
 
+static void test_write_check(void)
+{
+	struct test_ksyms_btf_write_check *skel;
+
+	skel = test_ksyms_btf_write_check__open_and_load();
+	ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n");
+
+	test_ksyms_btf_write_check__destroy(skel);
+}
+
 void test_ksyms_btf(void)
 {
 	int percpu_datasec;
@@ -167,4 +178,7 @@ void test_ksyms_btf(void)
 
 	if (test__start_subtest("weak_ksyms_lskel"))
 		test_weak_syms_lskel();
+
+	if (test__start_subtest("write_check"))
+		test_write_check();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
new file mode 100644
index 000000000000..2180c41cd890
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Google */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+extern const int bpf_prog_active __ksym; /* int type global var. */
+
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{
+	int *active;
+	__u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* Kernel memory obtained from bpf_{per,this}_cpu_ptr
+		 * is read-only, should _not_ pass verification.
+		 */
+		/* WRITE_ONCE */
+		*(volatile int *)active = -1;
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From eb473bac4a4b28eafb104e35410de3bc111b2e08 Mon Sep 17 00:00:00 2001
From: Baowen Zheng <baowen.zheng@corigine.com>
Date: Fri, 17 Dec 2021 19:16:29 +0100
Subject: selftests: tc-testing: add action offload selftest for action and
 filter

Add selftest cases in action police with skip_hw.
Add selftest case to validate flags of filter and action.
These tests depend on corresponding iproute2 command support.

Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../tc-testing/tc-tests/actions/police.json        | 24 ++++++++
 .../tc-testing/tc-tests/filters/matchall.json      | 72 ++++++++++++++++++++++
 2 files changed, 96 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
index 8e45792703ed..b7205a069534 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
@@ -812,5 +812,29 @@
         "teardown": [
             "$TC actions flush action police"
         ]
+    },
+    {
+        "id": "7d64",
+        "name": "Add police action with skip_hw option",
+        "category": [
+            "actions",
+            "police"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action police",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action police rate 1kbit burst 10k index 100 skip_hw",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions ls action police | grep skip_hw",
+        "matchPattern": "skip_hw",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action police"
+        ]
     }
 ]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json b/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json
index 51799874a972..2df68017dfb8 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json
@@ -387,5 +387,77 @@
             "$TC qdisc del dev $DUMMY ingress",
             "$IP link del dev $DUMMY type dummy"
         ]
+    },
+    {
+        "id": "3329",
+        "name": "Validate flags of the matchall filter with skip_sw and police action with skip_hw",
+        "category": [
+            "filter",
+            "matchall"
+        ],
+        "setup": [
+            "$IP link add dev $DUMMY type dummy || /bin/true",
+            "$TC qdisc add dev $DUMMY ingress",
+            "$TC actions flush action police",
+            "$TC actions add action police rate 1mbit burst 100k index 199 skip_hw"
+        ],
+        "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv4 matchall skip_sw action police index 199",
+        "expExitCode": "2",
+        "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 1 protocol ipv4 matchall",
+        "matchPattern": "^filter parent ffff: protocol ip pref 1 matchall.*handle 0x1.*",
+        "matchCount": "0",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY ingress",
+            "$IP link del dev $DUMMY type dummy",
+            "$TC actions del action police index 199"
+        ]
+    },
+    {
+        "id": "0eeb",
+        "name": "Validate flags of the matchall filter with skip_hw and police action",
+        "category": [
+            "filter",
+            "matchall"
+        ],
+        "setup": [
+            "$IP link add dev $DUMMY type dummy || /bin/true",
+            "$TC qdisc add dev $DUMMY ingress",
+            "$TC actions flush action police",
+            "$TC actions add action police rate 1mbit burst 100k index 199"
+        ],
+        "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv4 matchall skip_hw action police index 199",
+        "expExitCode": "2",
+        "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 1 protocol ipv4 matchall",
+        "matchPattern": "^filter parent ffff: protocol ip pref 1 matchall.*handle 0x1.*",
+        "matchCount": "0",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY ingress",
+            "$IP link del dev $DUMMY type dummy",
+            "$TC actions del action police index 199"
+        ]
+    },
+    {
+        "id": "eee4",
+        "name": "Validate flags of the matchall filter with skip_sw and police action",
+        "category": [
+            "filter",
+            "matchall"
+        ],
+        "setup": [
+            "$IP link add dev $DUMMY type dummy || /bin/true",
+            "$TC qdisc add dev $DUMMY ingress",
+            "$TC actions flush action police",
+            "$TC actions add action police rate 1mbit burst 100k index 199"
+        ],
+        "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv4 matchall skip_sw action police index 199",
+        "expExitCode": "2",
+        "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 1 protocol ipv4 matchall",
+        "matchPattern": "^filter parent ffff: protocol ip pref 1 matchall.*handle 0x1.*",
+        "matchCount": "0",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY ingress",
+            "$IP link del dev $DUMMY type dummy",
+            "$TC actions del action police index 199"
+        ]
     }
 ]
-- 
cgit v1.2.3


From 0b091a43d704997789c6d812b02167c8f5f9f061 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 16 Dec 2021 17:52:12 +0100
Subject: KVM: selftests: vmx_pmu_msrs_test: Drop tests mangling guest visible
 CPUIDs

Host initiated writes to MSR_IA32_PERF_CAPABILITIES should not depend
on guest visible CPUIDs and (incorrect) KVM logic implementing it is
about to change. Also, KVM_SET_CPUID{,2} after KVM_RUN is now forbidden
and causes test to fail.

Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN")
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20211216165213.338923-2-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
index 23051d84b907..2454a1f2ca0c 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
@@ -110,22 +110,5 @@ int main(int argc, char *argv[])
 	ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
 	TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
 
-	/* testcase 4, set capabilities when we don't have PDCM bit */
-	entry_1_0->ecx &= ~X86_FEATURE_PDCM;
-	vcpu_set_cpuid(vm, VCPU_ID, cpuid);
-	ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-	TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
-
-	/* testcase 5, set capabilities when we don't have PMU version bits */
-	entry_1_0->ecx |= X86_FEATURE_PDCM;
-	eax.split.version_id = 0;
-	entry_1_0->ecx = eax.full;
-	vcpu_set_cpuid(vm, VCPU_ID, cpuid);
-	ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
-	TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
-
-	vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
-	ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
-
 	kvm_vm_free(vm);
 }
-- 
cgit v1.2.3


From 577e022b7b41854911dcfb03678d8d2b930e8a3f Mon Sep 17 00:00:00 2001
From: Andrew Jones <drjones@redhat.com>
Date: Tue, 14 Dec 2021 16:18:42 +0100
Subject: selftests: KVM: Fix non-x86 compiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Attempting to compile on a non-x86 architecture fails with

include/kvm_util.h: In function â€˜vm_compute_max_gfnâ€™:
include/kvm_util.h:79:21: error: dereferencing pointer to incomplete type â€˜struct kvm_vmâ€™
  return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
                     ^~

This is because the declaration of struct kvm_vm is in
lib/kvm_util_internal.h as an effort to make it private to
the test lib code. We can still provide arch specific functions,
though, by making the generic function symbols weak. Do that to
fix the compile error.

Fixes: c8cc43c1eae2 ("selftests: KVM: avoid failures due to reserved HyperTransport region")
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Jones <drjones@redhat.com>
Message-Id: <20211214151842.848314-1-drjones@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 10 +---------
 tools/testing/selftests/kvm/lib/kvm_util.c     |  5 +++++
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index da2b702da71a..2d62edc49d67 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -71,15 +71,6 @@ enum vm_guest_mode {
 
 #endif
 
-#if defined(__x86_64__)
-unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
-#else
-static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
-{
-	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
-}
-#endif
-
 #define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)
 
@@ -330,6 +321,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
 unsigned int vm_get_page_size(struct kvm_vm *vm);
 unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
 uint64_t vm_get_max_gfn(struct kvm_vm *vm);
 int vm_get_fd(struct kvm_vm *vm);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index daf6fdb217a7..53d2b5d04b82 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2328,6 +2328,11 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm)
 	return vm->page_shift;
 }
 
+unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm)
+{
+	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+}
+
 uint64_t vm_get_max_gfn(struct kvm_vm *vm)
 {
 	return vm->max_gfn;
-- 
cgit v1.2.3


From ab1ef34416a65ba11f66ae6435fcf0251cb46fd4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 7 Dec 2021 19:30:06 +0000
Subject: KVM: selftests: Add test to verify TRIPLE_FAULT on invalid L2 guest
 state

Add a selftest to attempt to enter L2 with invalid guests state by
exiting to userspace via I/O from L2, and then using KVM_SET_SREGS to set
invalid guest state (marking TR unusable is arbitrary chosen for its
relative simplicity).

This is a regression test for a bug introduced by commit c8607e4a086f
("KVM: x86: nVMX: don't fail nested VM entry on invalid guest state if
!from_vmentry"), which incorrectly set vmx->fail=true when L2 had invalid
guest state and ultimately triggered a WARN due to nested_vmx_vmexit()
seeing vmx->fail==true while attempting to synthesize a nested VM-Exit.

The is also a functional test to verify that KVM sythesizes TRIPLE_FAULT
for L2, which is somewhat arbitrary behavior, instead of emulating L2.
KVM should never emulate L2 due to invalid guest state, as it's
architecturally impossible for L1 to run an L2 guest with invalid state
as nested VM-Enter should always fail, i.e. L1 needs to do the emulation.
Stuffing state via KVM ioctl() is a non-architctural, out-of-band case,
hence the TRIPLE_FAULT being rather arbitrary.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211207193006.120997-5-seanjc@google.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore             |   1 +
 tools/testing/selftests/kvm/Makefile               |   1 +
 .../kvm/x86_64/vmx_invalid_nested_guest_state.c    | 105 +++++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 00814c0f87a6..3cb5ac5da087 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -35,6 +35,7 @@
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
+/x86_64/vmx_invalid_nested_guest_state
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
 /x86_64/vmx_tsc_adjust_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index f307c9f61981..17342b575e85 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -64,6 +64,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
new file mode 100644
index 000000000000..489fbed4ca6f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID	0
+#define ARBITRARY_IO_PORT 0x2000
+
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+	/*
+	 * Generate an exit to L0 userspace, i.e. main(), via I/O to an
+	 * arbitrary port.
+	 */
+	asm volatile("inb %%dx, %%al"
+		     : : [port] "d" (ARBITRARY_IO_PORT) : "rax");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+
+	/* Prepare the VMCS for L2 execution. */
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	/*
+	 * L2 must be run without unrestricted guest, verify that the selftests
+	 * library hasn't enabled it.  Because KVM selftests jump directly to
+	 * 64-bit mode, unrestricted guest support isn't required.
+	 */
+	GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) ||
+		     !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST));
+
+	GUEST_ASSERT(!vmlaunch());
+
+	/* L2 should triple fault after main() stuffs invalid guest state. */
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT);
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t vmx_pages_gva;
+	struct kvm_sregs sregs;
+	struct kvm_run *run;
+	struct ucall uc;
+
+	nested_vmx_check_supported();
+
+	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+
+	/* Allocate VMX pages and shared descriptors (vmx_pages). */
+	vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+	vcpu_run(vm, VCPU_ID);
+
+	run = vcpu_state(vm, VCPU_ID);
+
+	/*
+	 * The first exit to L0 userspace should be an I/O access from L2.
+	 * Running L1 should launch L2 without triggering an exit to userspace.
+	 */
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+		    "Expected KVM_EXIT_IO, got: %u (%s)\n",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+
+	TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT,
+		    "Expected IN from port %d from L2, got port %d",
+		    ARBITRARY_IO_PORT, run->io.port);
+
+	/*
+	 * Stuff invalid guest state for L2 by making TR unusuable.  The next
+	 * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
+	 * emulating invalid guest state for L2.
+	 */
+	memset(&sregs, 0, sizeof(sregs));
+	vcpu_sregs_get(vm, VCPU_ID, &sregs);
+	sregs.tr.unusable = 1;
+	vcpu_sregs_set(vm, VCPU_ID, &sregs);
+
+	vcpu_run(vm, VCPU_ID);
+
+	switch (get_ucall(vm, VCPU_ID, &uc)) {
+	case UCALL_DONE:
+		break;
+	case UCALL_ABORT:
+		TEST_FAIL("%s", (const char *)uc.args[0]);
+	default:
+		TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+	}
+}
-- 
cgit v1.2.3


From 426b87b111b0523f957354bc97ec7eb16e8be1e2 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Mon, 20 Dec 2021 05:08:03 +0000
Subject: selftests/bpf: Correct the INDEX address in vmtest.sh

Migration of vmtest to libbpf/ci will change the address
of INDEX in vmtest.sh, which will cause vmtest.sh to not
work due to the failure of rootfs fetching.

Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Lorenzo Fontana <lorenzo.fontana@elastic.co>
Link: https://lore.kernel.org/bpf/20211220050803.2670677-1-pulehui@huawei.com
---
 tools/testing/selftests/bpf/vmtest.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 5e43c79ddc6e..b3afd43549fa 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -32,7 +32,7 @@ ROOTFS_IMAGE="root.img"
 OUTPUT_DIR="$HOME/.bpf_selftests"
 KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/config-latest.${ARCH}"
 KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/config-latest.${ARCH}"
-INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX"
+INDEX_URL="https://raw.githubusercontent.com/libbpf/ci/master/INDEX"
 NUM_COMPILE_JOBS="$(nproc)"
 LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")"
 LOG_FILE="${LOG_FILE_BASE}.log"
-- 
cgit v1.2.3


From a8968521cfdc3e339fe69473d6632e0aa8d7202a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 9 Dec 2021 22:59:44 +1100
Subject: selftests/powerpc: Add a test of sigreturning to the kernel

We have a general signal fuzzer, sigfuz, which can modify the MSR & NIP
before sigreturn. But the chance of it hitting a kernel address and also
clearing MSR_PR is fairly slim.

So add a specific test of sigreturn to a kernel address, both with and
without attempting to clear MSR_PR (which the kernel must block).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211209115944.4062384-1-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/signal/.gitignore  |   1 +
 tools/testing/selftests/powerpc/signal/Makefile    |   1 +
 .../selftests/powerpc/signal/sigreturn_kernel.c    | 132 +++++++++++++++++++++
 3 files changed, 134 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/signal/sigreturn_kernel.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore
index ce3375cd8e73..8f6c816099a4 100644
--- a/tools/testing/selftests/powerpc/signal/.gitignore
+++ b/tools/testing/selftests/powerpc/signal/.gitignore
@@ -4,3 +4,4 @@ signal_tm
 sigfuz
 sigreturn_vdso
 sig_sc_double_restart
+sigreturn_kernel
diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile
index d6ae54663aed..84e201572466 100644
--- a/tools/testing/selftests/powerpc/signal/Makefile
+++ b/tools/testing/selftests/powerpc/signal/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart
+TEST_GEN_PROGS += sigreturn_kernel
 
 CFLAGS += -maltivec
 $(OUTPUT)/signal_tm: CFLAGS += -mhtm
diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c b/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c
new file mode 100644
index 000000000000..0a1b6e591eee
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that we can't sigreturn to kernel addresses, or to kernel mode.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#define MSR_PR (1ul << 14)
+
+static volatile unsigned long long sigreturn_addr;
+static volatile unsigned long long sigreturn_msr_mask;
+
+static void sigusr1_handler(int signo, siginfo_t *si, void *uc_ptr)
+{
+	ucontext_t *uc = (ucontext_t *)uc_ptr;
+
+	if (sigreturn_addr)
+		UCONTEXT_NIA(uc) = sigreturn_addr;
+
+	if (sigreturn_msr_mask)
+		UCONTEXT_MSR(uc) &= sigreturn_msr_mask;
+}
+
+static pid_t fork_child(void)
+{
+	pid_t pid;
+
+	pid = fork();
+	if (pid == 0) {
+		raise(SIGUSR1);
+		exit(0);
+	}
+
+	return pid;
+}
+
+static int expect_segv(pid_t pid)
+{
+	int child_ret;
+
+	waitpid(pid, &child_ret, 0);
+	FAIL_IF(WIFEXITED(child_ret));
+	FAIL_IF(!WIFSIGNALED(child_ret));
+	FAIL_IF(WTERMSIG(child_ret) != 11);
+
+	return 0;
+}
+
+int test_sigreturn_kernel(void)
+{
+	struct sigaction act;
+	int child_ret, i;
+	pid_t pid;
+
+	act.sa_sigaction = sigusr1_handler;
+	act.sa_flags = SA_SIGINFO;
+	sigemptyset(&act.sa_mask);
+
+	FAIL_IF(sigaction(SIGUSR1, &act, NULL));
+
+	for (i = 0; i < 2; i++) {
+		// Return to kernel
+		sigreturn_addr = 0xcull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to kernel virtual
+		sigreturn_addr = 0xc008ull << 48;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return out of range
+		sigreturn_addr = 0xc010ull << 48;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to no-man's land, just below PAGE_OFFSET
+		sigreturn_addr = (0xcull << 60) - (64 * 1024);
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to no-man's land, above TASK_SIZE_4PB
+		sigreturn_addr = 0x1ull << 52;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xd space
+		sigreturn_addr = 0xdull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xe space
+		sigreturn_addr = 0xeull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xf space
+		sigreturn_addr = 0xfull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Attempt to set PR=0 for 2nd loop (should be blocked by kernel)
+		sigreturn_msr_mask = ~MSR_PR;
+	}
+
+	printf("All children killed as expected\n");
+
+	// Don't change address, just MSR, should return to user as normal
+	sigreturn_addr = 0;
+	sigreturn_msr_mask = ~MSR_PR;
+	pid = fork_child();
+	waitpid(pid, &child_ret, 0);
+	FAIL_IF(!WIFEXITED(child_ret));
+	FAIL_IF(WIFSIGNALED(child_ret));
+	FAIL_IF(WEXITSTATUS(child_ret) != 0);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test_sigreturn_kernel, "sigreturn_kernel");
+}
-- 
cgit v1.2.3


From f8464e084dd3c4cf37bdbeb06fea0afbd2e0f4e8 Mon Sep 17 00:00:00 2001
From: Carsten Haitzler <carsten.haitzler@arm.com>
Date: Wed, 15 Dec 2021 16:03:54 +0000
Subject: perf test: Use 3 digits for test numbering now we can have more tests

This is in preparation for adding more tests that will need the test
number to be 3 digts so they align nicely in the output.

Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Carsten Haitzler <carsten.haitzler@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: coresight@lists.linaro.org
Link: http://lore.kernel.org/lkml/20211215160403.69264-3-carsten.haitzler@foss.arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/builtin-test.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 1fb9f2a11d63..fac3717d9ba1 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -421,7 +421,7 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width,
 			continue;
 
 		st.file = ent->d_name;
-		pr_info("%2d: %-*s:", i, width, test_suite.desc);
+		pr_info("%3d: %-*s:", i, width, test_suite.desc);
 
 		if (intlist__find(skiplist, i)) {
 			color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n");
@@ -471,7 +471,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 				continue;
 		}
 
-		pr_info("%2d: %-*s:", i, width, test_description(t, -1));
+		pr_info("%3d: %-*s:", i, width, test_description(t, -1));
 
 		if (intlist__find(skiplist, i)) {
 			color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n");
@@ -511,7 +511,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 							curr, argc, argv))
 					continue;
 
-				pr_info("%2d.%1d: %-*s:", i, subi + 1, subw,
+				pr_info("%3d.%1d: %-*s:", i, subi + 1, subw,
 					test_description(t, subi));
 				test_and_print(t, subi);
 			}
@@ -546,7 +546,7 @@ static int perf_test__list_shell(int argc, const char **argv, int i)
 		if (!perf_test__matches(t.desc, curr, argc, argv))
 			continue;
 
-		pr_info("%2d: %s\n", i, t.desc);
+		pr_info("%3d: %s\n", i, t.desc);
 
 	}
 
@@ -568,14 +568,14 @@ static int perf_test__list(int argc, const char **argv)
 		if (!perf_test__matches(test_description(t, -1), curr, argc, argv))
 			continue;
 
-		pr_info("%2d: %s\n", i, test_description(t, -1));
+		pr_info("%3d: %s\n", i, test_description(t, -1));
 
 		if (has_subtests(t)) {
 			int subn = num_subtests(t);
 			int subi;
 
 			for (subi = 0; subi < subn; subi++)
-				pr_info("%2d:%1d: %s\n", i, subi + 1,
+				pr_info("%3d:%1d: %s\n", i, subi + 1,
 					test_description(t, subi));
 		}
 	}
-- 
cgit v1.2.3


From 7248e308a57587615431b83689cd57e957815bfc Mon Sep 17 00:00:00 2001
From: Alexandre Truong <alexandre.truong@arm.com>
Date: Fri, 17 Dec 2021 15:45:15 +0000
Subject: perf tools: Record ARM64 LR register automatically

On ARM64, automatically record the link register if the frame pointer
mode is on. It will be used to do a dwarf unwind to find the caller of
the leaf frame if the frame pointer was omitted.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-2-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/util/machine.c | 7 +++++++
 tools/perf/builtin-record.c          | 8 ++++++++
 tools/perf/util/callchain.h          | 2 ++
 3 files changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index 7e7714290a87..d2ce31e28cd7 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -5,6 +5,8 @@
 #include <string.h>
 #include "debug.h"
 #include "symbol.h"
+#include "callchain.h"
+#include "record.h"
 
 /* On arm64, kernel text segment starts at high memory address,
  * for example 0xffff 0000 8xxx xxxx. Modules start at a low memory
@@ -26,3 +28,8 @@ void arch__symbols__fixup_end(struct symbol *p, struct symbol *c)
 		p->end = c->start;
 	pr_debug4("%s sym:%s end:%#" PRIx64 "\n", __func__, p->name, p->end);
 }
+
+void arch__add_leaf_frame_record_opts(struct record_opts *opts)
+{
+	opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0338b813585a..6ac2160913ea 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2267,6 +2267,10 @@ out_free:
 	return ret;
 }
 
+void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
+{
+}
+
 static int parse_control_option(const struct option *opt,
 				const char *str,
 				int unset __maybe_unused)
@@ -2898,6 +2902,10 @@ int cmd_record(int argc, const char **argv)
 	}
 
 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
+
+	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
+		arch__add_leaf_frame_record_opts(&rec->opts);
+
 	err = -ENOMEM;
 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
 		usage_with_options(record_usage, record_options);
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 5824134f983b..77fba053c677 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -280,6 +280,8 @@ static inline int arch_skip_callchain_idx(struct thread *thread __maybe_unused,
 }
 #endif
 
+void arch__add_leaf_frame_record_opts(struct record_opts *opts);
+
 char *callchain_list__sym_name(struct callchain_list *cl,
 			       char *bf, size_t bfsize, bool show_dso);
 char *callchain_node__scnprintf_value(struct callchain_node *node,
-- 
cgit v1.2.3


From 32bfa5bf71db672c646751da131a17aace8cceac Mon Sep 17 00:00:00 2001
From: Alexandre Truong <alexandre.truong@arm.com>
Date: Fri, 17 Dec 2021 15:45:16 +0000
Subject: perf machine: Add a mechanism to inject stack frames

Add a mechanism for platforms to inject stack frames for the leaf
frame caller if there is enough information to determine a frame
is missing from dwarf or other post processing mechanisms.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-3-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/machine.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index fb8496df8432..3eddad009f78 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2710,6 +2710,12 @@ static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread,
 	return err;
 }
 
+static u64 get_leaf_frame_caller(struct perf_sample *sample __maybe_unused,
+		struct thread *thread __maybe_unused, int usr_idx __maybe_unused)
+{
+	return 0;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
 					    struct callchain_cursor *cursor,
 					    struct evsel *evsel,
@@ -2723,9 +2729,10 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = 0;
 	u8 cpumode = PERF_RECORD_MISC_USER;
-	int i, j, err, nr_entries;
+	int i, j, err, nr_entries, usr_idx;
 	int skip_idx = -1;
 	int first_call = 0;
+	u64 leaf_frame_caller;
 
 	if (chain)
 		chain_nr = chain->nr;
@@ -2850,6 +2857,34 @@ check_calls:
 			continue;
 		}
 
+		/*
+		 * PERF_CONTEXT_USER allows us to locate where the user stack ends.
+		 * Depending on callchain_param.order and the position of PERF_CONTEXT_USER,
+		 * the index will be different in order to add the missing frame
+		 * at the right place.
+		 */
+
+		usr_idx = callchain_param.order == ORDER_CALLEE ? j-2 : j-1;
+
+		if (usr_idx >= 0 && chain->ips[usr_idx] == PERF_CONTEXT_USER) {
+
+			leaf_frame_caller = get_leaf_frame_caller(sample, thread, usr_idx);
+
+			/*
+			 * check if leaf_frame_Caller != ip to not add the same
+			 * value twice.
+			 */
+
+			if (leaf_frame_caller && leaf_frame_caller != ip) {
+
+				err = add_callchain_ip(thread, cursor, parent,
+					       root_al, &cpumode, leaf_frame_caller,
+					       false, NULL, NULL, 0);
+				if (err)
+					return (err < 0) ? err : 0;
+			}
+		}
+
 		err = add_callchain_ip(thread, cursor, parent,
 				       root_al, &cpumode, ip,
 				       false, NULL, NULL, 0);
-- 
cgit v1.2.3


From ab23692134489f0f563168449fc27bfb5d6b04dd Mon Sep 17 00:00:00 2001
From: Alexandre Truong <alexandre.truong@arm.com>
Date: Fri, 17 Dec 2021 15:45:17 +0000
Subject: perf script: Use callchain_param_setup() instead of open coded
 equivalent

Refactoring script__setup_sample_type() by using callchain_param_setup()
to replace the duplicate code for callchain parameter setting up.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-4-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index da2175d70ac9..ab7d575f97f2 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -3468,16 +3468,7 @@ static void script__setup_sample_type(struct perf_script *script)
 	struct perf_session *session = script->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) {
-		if ((sample_type & PERF_SAMPLE_REGS_USER) &&
-		    (sample_type & PERF_SAMPLE_STACK_USER)) {
-			callchain_param.record_mode = CALLCHAIN_DWARF;
-			dwarf_callchain_users = true;
-		} else if (sample_type & PERF_SAMPLE_BRANCH_STACK)
-			callchain_param.record_mode = CALLCHAIN_LBR;
-		else
-			callchain_param.record_mode = CALLCHAIN_FP;
-	}
+	callchain_param_setup(sample_type);
 
 	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
-- 
cgit v1.2.3


From aa8db3e41dae953b636ff68e944479900d149a37 Mon Sep 17 00:00:00 2001
From: Alexandre Truong <alexandre.truong@arm.com>
Date: Fri, 17 Dec 2021 15:45:18 +0000
Subject: perf callchain: Enable dwarf_callchain_users on arm64

Enable dwarf_callchain_users on arm64 which will be needed to do a
DWARF unwind in order to get the caller of the leaf frame.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-5-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  4 ++--
 tools/perf/builtin-script.c |  4 ++--
 tools/perf/util/callchain.c | 14 +++++++++++++-
 tools/perf/util/callchain.h |  2 +-
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8ae400429870..1dd92d8c9279 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -410,7 +410,7 @@ static int report__setup_sample_type(struct report *rep)
 		}
 	}
 
-	callchain_param_setup(sample_type);
+	callchain_param_setup(sample_type, perf_env__arch(&rep->session->header.env));
 
 	if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
@@ -1127,7 +1127,7 @@ static int process_attr(struct perf_tool *tool __maybe_unused,
 	 * on events sample_type.
 	 */
 	sample_type = evlist__combined_sample_type(*pevlist);
-	callchain_param_setup(sample_type);
+	callchain_param_setup(sample_type, perf_env__arch((*pevlist)->env));
 	return 0;
 }
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ab7d575f97f2..d308adfd1176 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2318,7 +2318,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event,
 	 * on events sample_type.
 	 */
 	sample_type = evlist__combined_sample_type(evlist);
-	callchain_param_setup(sample_type);
+	callchain_param_setup(sample_type, perf_env__arch((*pevlist)->env));
 
 	/* Enable fields for callchain entries */
 	if (symbol_conf.use_callchain &&
@@ -3468,7 +3468,7 @@ static void script__setup_sample_type(struct perf_script *script)
 	struct perf_session *session = script->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-	callchain_param_setup(sample_type);
+	callchain_param_setup(sample_type, perf_env__arch(session->machines.host.env));
 
 	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 8e2777133bd9..131207b91d15 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1600,7 +1600,7 @@ void callchain_cursor_reset(struct callchain_cursor *cursor)
 		map__zput(node->ms.map);
 }
 
-void callchain_param_setup(u64 sample_type)
+void callchain_param_setup(u64 sample_type, const char *arch)
 {
 	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) {
 		if ((sample_type & PERF_SAMPLE_REGS_USER) &&
@@ -1612,6 +1612,18 @@ void callchain_param_setup(u64 sample_type)
 		else
 			callchain_param.record_mode = CALLCHAIN_FP;
 	}
+
+	/*
+	 * It's necessary to use libunwind to reliably determine the caller of
+	 * a leaf function on aarch64, as otherwise we cannot know whether to
+	 * start from the LR or FP.
+	 *
+	 * Always starting from the LR can result in duplicate or entirely
+	 * erroneous entries. Always skipping the LR and starting from the FP
+	 * can result in missing entries.
+	 */
+	if (callchain_param.record_mode == CALLCHAIN_FP && !strcmp(arch, "arm64"))
+		dwarf_callchain_users = true;
 }
 
 static bool chain_match(struct callchain_list *base_chain,
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 77fba053c677..d95615daed73 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -300,7 +300,7 @@ int callchain_branch_counts(struct callchain_root *root,
 			    u64 *branch_count, u64 *predicted_count,
 			    u64 *abort_count, u64 *cycles_count);
 
-void callchain_param_setup(u64 sample_type);
+void callchain_param_setup(u64 sample_type, const char *arch);
 
 bool callchain_cnode_matched(struct callchain_node *base_cnode,
 			     struct callchain_node *pair_cnode);
-- 
cgit v1.2.3


From ffc60350489db9f6e3010ac1e795078cb0d06efe Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Fri, 17 Dec 2021 15:45:19 +0000
Subject: perf tools: Refactor SMPL_REG macro in perf_regs.h

Refactor the SAMPL_REG macro so that it can be used in a followup commit
to obtain the masks for ARM64 registers.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: German Gomez <german.gomez@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-6-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/perf_regs.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 4e6b1299c571..ce1127af05e4 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -11,8 +11,11 @@ struct sample_reg {
 	const char *name;
 	uint64_t mask;
 };
-#define SMPL_REG(n, b) { .name = #n, .mask = 1ULL << (b) }
-#define SMPL_REG2(n, b) { .name = #n, .mask = 3ULL << (b) }
+
+#define SMPL_REG_MASK(b) (1ULL << (b))
+#define SMPL_REG(n, b) { .name = #n, .mask = SMPL_REG_MASK(b) }
+#define SMPL_REG2_MASK(b) (3ULL << (b))
+#define SMPL_REG2(n, b) { .name = #n, .mask = SMPL_REG2_MASK(b) }
 #define SMPL_REG_END { .name = NULL }
 
 enum {
-- 
cgit v1.2.3


From b9f6fbb3b2c29736970ae9fcc0e82b0bd459442b Mon Sep 17 00:00:00 2001
From: Alexandre Truong <alexandre.truong@arm.com>
Date: Fri, 17 Dec 2021 15:45:20 +0000
Subject: perf arm64: Inject missing frames when using 'perf record
 --call-graph=fp'

When unwinding using frame pointers on ARM64, the return address of the
current function may not have been pushed into the stack when a function
was interrupted, which makes perf show an incorrect call graph to the
user.

Consider the following example program:

  void leaf() {
      /* long computation */
  }

  void parent() {
      // (1)
      leaf();
      // (2)
  }

  ... could be compiled into (using gcc -fno-inline -fno-omit-frame-pointer):

  leaf:
      /* long computation */
      nop
      ret
  parent:
      // (1)
      stp     x29, x30, [sp, -16]!
      mov     x29, sp
      bl      parent
      nop
      ldp     x29, x30, [sp], 16
      // (2)
      ret

If the program is interrupted at (1), (2), or any point in "leaf:", the
call graph will skip the callers of the current function. We can unwind
using the dwarf info and check if the return addr is the same as the LR
register, and inject the missing frame into the call graph.

Before this patch, the above example shows the following call-graph when
recording using "--call-graph fp" mode in ARM64:

  # Children      Self  Command   Shared Object     Symbol
  # ........  ........  ........  ................  ......................
  #
      99.86%    99.86%  program3  program3          [.] leaf
  	    |
  	    ---_start
  	       __libc_start_main
  	       main
  	       leaf

As can be seen, the "parent" function is missing. This is specially
problematic in "leaf" because for leaf functions the compiler may always
omit pushing the return addr into the stack. After this patch, it shows
the correct graph:

  # Children      Self  Command   Shared Object     Symbol
  # ........  ........  ........  ................  ......................
  #
      99.86%    99.86%  program3  program3          [.] leaf
  	    |
  	    ---_start
  	       __libc_start_main
  	       main
  	       parent
  	       leaf

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Alexandre Truong <alexandre.truong@arm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20211217154521.80603-7-german.gomez@arm.com
Signed-off-by: German Gomez <german.gomez@arm.com>
[ Rename machine__normalize_is() to machine__normalized_is(), as suggested by James Clark ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/Build                              |  1 +
 .../perf/util/arm64-frame-pointer-unwind-support.c | 63 ++++++++++++++++++++++
 .../perf/util/arm64-frame-pointer-unwind-support.h | 10 ++++
 tools/perf/util/machine.c                          | 19 +++++--
 tools/perf/util/machine.h                          |  1 +
 5 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 tools/perf/util/arm64-frame-pointer-unwind-support.c
 create mode 100644 tools/perf/util/arm64-frame-pointer-unwind-support.h

(limited to 'tools')

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 294b12430d73..2a403cefcaf2 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -1,3 +1,4 @@
+perf-y += arm64-frame-pointer-unwind-support.o
 perf-y += annotate.o
 perf-y += block-info.o
 perf-y += block-range.o
diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.c b/tools/perf/util/arm64-frame-pointer-unwind-support.c
new file mode 100644
index 000000000000..4f5ecf51ed38
--- /dev/null
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "arm64-frame-pointer-unwind-support.h"
+#include "callchain.h"
+#include "event.h"
+#include "perf_regs.h" // SMPL_REG_MASK
+#include "unwind.h"
+
+#define perf_event_arm_regs perf_event_arm64_regs
+#include "../arch/arm64/include/uapi/asm/perf_regs.h"
+#undef perf_event_arm_regs
+
+struct entries {
+	u64 stack[2];
+	size_t length;
+};
+
+static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
+{
+	return callchain_param.record_mode == CALLCHAIN_FP && sample->user_regs.regs
+		&& sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_LR);
+}
+
+static int add_entry(struct unwind_entry *entry, void *arg)
+{
+	struct entries *entries = arg;
+
+	entries->stack[entries->length++] = entry->ip;
+	return 0;
+}
+
+u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread, int usr_idx)
+{
+	int ret;
+	struct entries entries = {};
+	struct regs_dump old_regs = sample->user_regs;
+
+	if (!get_leaf_frame_caller_enabled(sample))
+		return 0;
+
+	/*
+	 * If PC and SP are not recorded, get the value of PC from the stack
+	 * and set its mask. SP is not used when doing the unwinding but it
+	 * still needs to be set to prevent failures.
+	 */
+
+	if (!(sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_PC))) {
+		sample->user_regs.cache_mask |= SMPL_REG_MASK(PERF_REG_ARM64_PC);
+		sample->user_regs.cache_regs[PERF_REG_ARM64_PC] = sample->callchain->ips[usr_idx+1];
+	}
+
+	if (!(sample->user_regs.mask & SMPL_REG_MASK(PERF_REG_ARM64_SP))) {
+		sample->user_regs.cache_mask |= SMPL_REG_MASK(PERF_REG_ARM64_SP);
+		sample->user_regs.cache_regs[PERF_REG_ARM64_SP] = 0;
+	}
+
+	ret = unwind__get_entries(add_entry, &entries, thread, sample, 2);
+	sample->user_regs = old_regs;
+
+	if (ret || entries.length != 2)
+		return ret;
+
+	return callchain_param.order == ORDER_CALLER ? entries.stack[0] : entries.stack[1];
+}
diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.h b/tools/perf/util/arm64-frame-pointer-unwind-support.h
new file mode 100644
index 000000000000..32af9ce94398
--- /dev/null
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
+#define __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
+
+#include "event.h"
+#include "thread.h"
+
+u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread, int user_idx);
+
+#endif /* __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 3eddad009f78..3901440aeff9 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -34,6 +34,7 @@
 #include "bpf-event.h"
 #include <internal/lib.h> // page_size
 #include "cgroup.h"
+#include "arm64-frame-pointer-unwind-support.h"
 
 #include <linux/ctype.h>
 #include <symbol/kallsyms.h>
@@ -2710,10 +2711,13 @@ static int find_prev_cpumode(struct ip_callchain *chain, struct thread *thread,
 	return err;
 }
 
-static u64 get_leaf_frame_caller(struct perf_sample *sample __maybe_unused,
-		struct thread *thread __maybe_unused, int usr_idx __maybe_unused)
+static u64 get_leaf_frame_caller(struct perf_sample *sample,
+		struct thread *thread, int usr_idx)
 {
-	return 0;
+	if (machine__normalized_is(thread->maps->machine, "arm64"))
+		return get_leaf_frame_caller_aarch64(sample, thread, usr_idx);
+	else
+		return 0;
 }
 
 static int thread__resolve_callchain_sample(struct thread *thread,
@@ -3114,14 +3118,19 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
 }
 
 /*
- * Compares the raw arch string. N.B. see instead perf_env__arch() if a
- * normalized arch is needed.
+ * Compares the raw arch string. N.B. see instead perf_env__arch() or
+ * machine__normalized_is() if a normalized arch is needed.
  */
 bool machine__is(struct machine *machine, const char *arch)
 {
 	return machine && !strcmp(perf_env__raw_arch(machine->env), arch);
 }
 
+bool machine__normalized_is(struct machine *machine, const char *arch)
+{
+	return machine && !strcmp(perf_env__arch(machine->env), arch);
+}
+
 int machine__nr_cpus_avail(struct machine *machine)
 {
 	return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index a143087eeb47..c5a45dc8df4c 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -208,6 +208,7 @@ static inline bool machine__is_host(struct machine *machine)
 }
 
 bool machine__is(struct machine *machine, const char *arch);
+bool machine__normalized_is(struct machine *machine, const char *arch);
 int machine__nr_cpus_avail(struct machine *machine);
 
 struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
-- 
cgit v1.2.3


From 1a1a0b0364ad291bd8e509da104ac8b5b1afec5d Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Mon, 20 Dec 2021 22:45:28 +0100
Subject: bpftool: Enable line buffering for stdout

The output of bpftool prog tracelog is currently buffered, which is
inconvenient when piping the output into other commands. A simple
tracelog | grep will typically not display anything. This patch fixes it
by enabling line buffering on stdout for the whole bpftool binary.

Fixes: 30da46b5dc3a ("tools: bpftool: add a command to dump the trace pipe")
Signed-off-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211220214528.GA11706@Mem
---
 tools/bpf/bpftool/main.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 8b71500e7cb2..020e91a542d5 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -408,6 +408,8 @@ int main(int argc, char **argv)
 	bool version_requested = false;
 	int opt, ret;
 
+	setlinebuf(stdout);
+
 	last_do_help = do_help;
 	pretty_output = false;
 	json_output = false;
-- 
cgit v1.2.3


From 7fbddf40b881a2430daf1bd03ba80e871a2fadce Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Mon, 6 Dec 2021 14:47:47 +0530
Subject: tools headers UAPI: Add new macros for mem_hops field to perf_event.h

Add new macros for mem_hops field which can be used to represent
remote-node, socket and board level details.

Currently the code had macro for HOPS_0 which, corresponds to data
coming from another core but same node.  Add new macros for HOPS_1 to
HOPS_3 to represent remote-node, socket and board level data.

Also add corresponding strings in the mem_hops array to represent
mem_hop field data in perf_mem__lvl_scnprintf function

Incase mem_hops field is used, PERF_MEM_LVLNUM field also need to be set
inorder to represent the data source. Hence printing data source via
PERF_MEM_LVL field can be skip in that scenario.

For ex: Encodings for mem_hops fields with L2 cache:

  L2                      - local L2
  L2 | REMOTE | HOPS_0    - remote core, same node L2
  L2 | REMOTE | HOPS_1    - remote node, same socket L2
  L2 | REMOTE | HOPS_2    - remote socket, same board L2
  L2 | REMOTE | HOPS_3    - remote board L2

Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nageswara R Sastry <rnsastry@linux.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lore.kernel.org/lkml/20211206091749.87585-3-kjain@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/perf_event.h |  5 ++++-
 tools/perf/util/mem-events.c          | 29 ++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index bd8860eeb291..4cd39aaccbe7 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -1332,7 +1332,10 @@ union perf_mem_data_src {
 
 /* hop level */
 #define PERF_MEM_HOPS_0		0x01 /* remote core, same node */
-/* 2-7 available */
+#define PERF_MEM_HOPS_1         0x02 /* remote node, same socket */
+#define PERF_MEM_HOPS_2         0x03 /* remote socket, same board */
+#define PERF_MEM_HOPS_3         0x04 /* remote board */
+/* 5-7 available */
 #define PERF_MEM_HOPS_SHIFT	43
 
 #define PERF_MEM_S(a, s) \
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 3167b4628b6d..ed0ab838bcc5 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -309,6 +309,9 @@ static const char * const mem_hops[] = {
 	 * to be set with mem_hops field.
 	 */
 	"core, same node",
+	"node, same socket",
+	"socket, same board",
+	"board",
 };
 
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
@@ -316,7 +319,7 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 	size_t i, l = 0;
 	u64 m =  PERF_MEM_LVL_NA;
 	u64 hit, miss;
-	int printed;
+	int printed = 0;
 
 	if (mem_info)
 		m  = mem_info->data_src.mem_lvl;
@@ -335,18 +338,22 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 		l += 7;
 	}
 
-	if (mem_info && mem_info->data_src.mem_hops)
+	/*
+	 * Incase mem_hops field is set, we can skip printing data source via
+	 * PERF_MEM_LVL namespace.
+	 */
+	if (mem_info && mem_info->data_src.mem_hops) {
 		l += scnprintf(out + l, sz - l, "%s ", mem_hops[mem_info->data_src.mem_hops]);
-
-	printed = 0;
-	for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
-		if (!(m & 0x1))
-			continue;
-		if (printed++) {
-			strcat(out, " or ");
-			l += 4;
+	} else {
+		for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
+			if (!(m & 0x1))
+				continue;
+			if (printed++) {
+				strcat(out, " or ");
+				l += 4;
+			}
+			l += scnprintf(out + l, sz - l, mem_lvl[i]);
 		}
-		l += scnprintf(out + l, sz - l, mem_lvl[i]);
 	}
 
 	if (mem_info && mem_info->data_src.mem_lvl_num) {
-- 
cgit v1.2.3


From 878aed8db324bec64f3c3f956e64d5ae7375a5de Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 17 Dec 2021 11:29:57 +0100
Subject: netfilter: nat: force port remap to prevent shadowing well-known
 ports

If destination port is above 32k and source port below 16k
assume this might cause 'port shadowing' where a 'new' inbound
connection matches an existing one, e.g.

inbound X:41234 -> Y:53 matches existing conntrack entry
        Z:53 -> X:4123, where Z got natted to X.

In this case, new packet is natted to Z:53 which is likely
unwanted.

We avoid the rewrite for connections that originate from local host:
port-shadowing is only possible with forwarded connections.

Also adjust test case.

v3: no need to call tuple_force_port_remap if already in random mode (Phil)

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Phil Sutter <phil@nwl.cc>
Acked-by: Eric Garver <eric@garver.life>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c                  | 43 ++++++++++++++++++++++++++--
 tools/testing/selftests/netfilter/nft_nat.sh |  5 ++--
 2 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index ab9f6c75524d..3dd130487b5b 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -494,6 +494,38 @@ another_round:
 	goto another_round;
 }
 
+static bool tuple_force_port_remap(const struct nf_conntrack_tuple *tuple)
+{
+	u16 sp, dp;
+
+	switch (tuple->dst.protonum) {
+	case IPPROTO_TCP:
+		sp = ntohs(tuple->src.u.tcp.port);
+		dp = ntohs(tuple->dst.u.tcp.port);
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		sp = ntohs(tuple->src.u.udp.port);
+		dp = ntohs(tuple->dst.u.udp.port);
+		break;
+	default:
+		return false;
+	}
+
+	/* IANA: System port range: 1-1023,
+	 *         user port range: 1024-49151,
+	 *      private port range: 49152-65535.
+	 *
+	 * Linux default ephemeral port range is 32768-60999.
+	 *
+	 * Enforce port remapping if sport is significantly lower
+	 * than dport to prevent NAT port shadowing, i.e.
+	 * accidental match of 'new' inbound connection vs.
+	 * existing outbound one.
+	 */
+	return sp < 16384 && dp >= 32768;
+}
+
 /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
  * we change the source to map into the range. For NF_INET_PRE_ROUTING
  * and NF_INET_LOCAL_OUT, we change the destination to map into the
@@ -507,11 +539,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 struct nf_conn *ct,
 		 enum nf_nat_manip_type maniptype)
 {
+	bool random_port = range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL;
 	const struct nf_conntrack_zone *zone;
 	struct net *net = nf_ct_net(ct);
 
 	zone = nf_ct_zone(ct);
 
+	if (maniptype == NF_NAT_MANIP_SRC &&
+	    !random_port &&
+	    !ct->local_origin)
+		random_port = tuple_force_port_remap(orig_tuple);
+
 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
 	 * and that same mapping gives a unique tuple within the given
 	 * range, use that.
@@ -520,8 +558,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	 * So far, we don't do local source mappings, so multiple
 	 * manips not an issue.
 	 */
-	if (maniptype == NF_NAT_MANIP_SRC &&
-	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+	if (maniptype == NF_NAT_MANIP_SRC && !random_port) {
 		/* try the original tuple first */
 		if (in_range(orig_tuple, range)) {
 			if (!nf_nat_used_tuple(orig_tuple, ct)) {
@@ -545,7 +582,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	 */
 
 	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+	if (!random_port) {
 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
 			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
 			    l4proto_in_range(tuple, maniptype,
diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
index d88867d2fed7..349a319a9e51 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -880,8 +880,9 @@ EOF
 		return $ksft_skip
 	fi
 
-	# test default behaviour. Packet from ns1 to ns0 is redirected to ns2.
-	test_port_shadow "default" "CLIENT"
+	# test default behaviour. Packet from ns1 to ns0 is not redirected
+	# due to automatic port translation.
+	test_port_shadow "default" "ROUTER"
 
 	# test packet filter based mitigation: prevent forwarding of
 	# packets claiming to come from the service port.
-- 
cgit v1.2.3


From 0cd0b1f7a6e4e9b3b3ce62d6130af60259e103cd Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:43 +0200
Subject: selftests: lib.sh: Add PING_COUNT to allow sending configurable
 amount of packets

Currently `ping_do()` and `ping6_do()` send 10 packets.

There are cases that it is not possible to catch only the interesting
packets using tc rule, so then, it is possible to send many packets and
verify that at least this amount of packets hit the rule.

Add `PING_COUNT` variable, which is set to 10 by default, to allow tests
sending more than 10 packets using the existing ping API.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/lib.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index dfd827b7a9f9..7da783d6f453 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -20,6 +20,7 @@ NETIF_TYPE=${NETIF_TYPE:=veth}
 NETIF_CREATE=${NETIF_CREATE:=yes}
 MCD=${MCD:=smcrouted}
 MC_CLI=${MC_CLI:=smcroutectl}
+PING_COUNT=${PING_COUNT:=10}
 PING_TIMEOUT=${PING_TIMEOUT:=5}
 WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
 INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
@@ -1111,7 +1112,8 @@ ping_do()
 
 	vrf_name=$(master_name_get $if_name)
 	ip vrf exec $vrf_name \
-		$PING $args $dip -c 10 -i 0.1 -w $PING_TIMEOUT &> /dev/null
+		$PING $args $dip -c $PING_COUNT -i 0.1 \
+		-w $PING_TIMEOUT &> /dev/null
 }
 
 ping_test()
@@ -1132,7 +1134,8 @@ ping6_do()
 
 	vrf_name=$(master_name_get $if_name)
 	ip vrf exec $vrf_name \
-		$PING6 $args $dip -c 10 -i 0.1 -w $PING_TIMEOUT &> /dev/null
+		$PING6 $args $dip -c $PING_COUNT -i 0.1 \
+		-w $PING_TIMEOUT &> /dev/null
 }
 
 ping6_test()
-- 
cgit v1.2.3


From b07e9957f2207aa5965abe55578d6efb0d88460d Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:44 +0200
Subject: selftests: forwarding: Add VxLAN tests with a VLAN-unaware bridge for
 IPv6

Add tests similar to vxlan_bridge_1d.sh and vxlan_bridge_1d_port_8472.sh.

The tests set up a topology with three VxLAN endpoints: one
"local", possibly offloaded, and two "remote", formed using veth pairs
and likely purely software bridges. The "local" endpoint is connected to
host systems by a VLAN-unaware bridge.

Since VxLAN tunnels must be unique per namespace, each of the "remote"
endpoints is in its own namespace. H3 forms the bridge between the three
domains.

Send IPv4 packets and IPv6 packets with IPv6 underlay.
Use `TC_FLAG`, which is defined in `forwarding.config` file, for TC
checks. `TC_FLAG` allows testing that on HW datapath, the traffic
actually goes through HW.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/forwarding/vxlan_bridge_1d_ipv6.sh         | 804 +++++++++++++++++++++
 .../forwarding/vxlan_bridge_1d_port_8472_ipv6.sh   |  11 +
 2 files changed, 815 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
new file mode 100755
index 000000000000..ac97f07e5ce8
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
@@ -0,0 +1,804 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-----------------------+                          +------------------------+
+# | H1 (vrf)              |                          | H2 (vrf)               |
+# |    + $h1              |                          |    + $h2               |
+# |    | 192.0.2.1/28     |                          |    | 192.0.2.2/28      |
+# |    | 2001:db8:1::1/64 |                          |    | 2001:db8:1::2/64  |
+# +----|------------------+                          +----|-------------------+
+#      |                                                  |
+# +----|--------------------------------------------------|-------------------+
+# | SW |                                                  |                   |
+# | +--|--------------------------------------------------|-----------------+ |
+# | |  + $swp1                   BR1 (802.1d)             + $swp2           | |
+# | |                                                                       | |
+# | |  + vx1 (vxlan)                                                        | |
+# | |    local 2001:db8:3::1                                                | |
+# | |    remote 2001:db8:4::1 2001:db8:5::1                                 | |
+# | |    id 1000 dstport $VXPORT                                            | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |  2001:db8:4::0/64 via 2001:db8:3::2                                       |
+# |  2001:db8:5::0/64 via 2001:db8:3::2                                       |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|----------------------------------------------------------+
+# |    |                                             VRP2 (vrf)   |
+# |    + $rp2                                                     |
+# |      2001:db8:3::2/64                                         |
+# |                                                               |  (maybe) HW
+# =============================================================================
+# |                                                               |  (likely) SW
+# |    + v1 (veth)                             + v3 (veth)        |
+# |    | 2001:db8:4::2/64                      | 2001:db8:5::2/64 |
+# +----|---------------------------------------|------------------+
+#      |                                       |
+# +----|--------------------------------+ +----|-------------------------------+
+# |    + v2 (veth)        NS1 (netns)   | |    + v4 (veth)        NS2 (netns)  |
+# |      2001:db8:4::1/64               | |      2001:db8:5::1/64              |
+# |                                     | |                                    |
+# | 2001:db8:3::0/64 via 2001:db8:4::2  | | 2001:db8:3::0/64 via 2001:db8:5::2 |
+# | 2001:db8:5::1/128 via 2001:db8:4::2 | | 2001:db8:4::1/128 via              |
+# |                                     | |         2001:db8:5::2              |
+# |                                     | |                                    |
+# | +-------------------------------+   | | +-------------------------------+  |
+# | |                  BR2 (802.1d) |   | | |                  BR2 (802.1d) |  |
+# | |  + vx2 (vxlan)                |   | | |  + vx2 (vxlan)                |  |
+# | |    local 2001:db8:4::1        |   | | |    local 2001:db8:5::1        |  |
+# | |    remote 2001:db8:3::1       |   | | |    remote 2001:db8:3::1       |  |
+# | |    remote 2001:db8:5::1       |   | | |    remote 2001:db8:4::1       |  |
+# | |    id 1000 dstport $VXPORT    |   | | |    id 1000 dstport $VXPORT    |  |
+# | |                               |   | | |                               |  |
+# | |  + w1 (veth)                  |   | | |  + w1 (veth)                  |  |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# |    |                                | |    |                               |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# | |  + w2 (veth)        VW2 (vrf) |   | | |  + w2 (veth)        VW2 (vrf) |  |
+# | |    192.0.2.3/28               |   | | |    192.0.2.4/28               |  |
+# | |    2001:db8:1::3/64           |   | | |    2001:db8:1::4/64           |  |
+# | +-------------------------------+   | | +-------------------------------+  |
+# +-------------------------------------+ +------------------------------------+
+
+: ${VXPORT:=4789}
+export VXPORT
+
+: ${ALL_TESTS:="
+	ping_ipv4
+	ping_ipv6
+	test_flood
+	test_unicast
+	test_ttl
+	test_tos
+	test_ecn_encap
+	test_ecn_decap
+	reapply_config
+	ping_ipv4
+	ping_ipv6
+	test_flood
+	test_unicast
+"}
+
+NUM_NETIFS=6
+source lib.sh
+source tc_common.sh
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+	tc qdisc add dev $h1 clsact
+}
+
+h1_destroy()
+{
+	tc qdisc del dev $h1 clsact
+	simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+	tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+	tc qdisc del dev $h2 clsact
+	simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+rp1_set_addr()
+{
+	ip address add dev $rp1 2001:db8:3::1/64
+
+	ip route add 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+	ip route add 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+}
+
+rp1_unset_addr()
+{
+	ip route del 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+	ip route del 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+
+	ip address del dev $rp1 2001:db8:3::1/64
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	ip link set dev $rp1 up
+	rp1_set_addr
+	tc qdisc add dev $rp1 clsact
+
+	ip link add name vx1 type vxlan id 1000	local 2001:db8:3::1 \
+		dstport "$VXPORT" nolearning udp6zerocsumrx udp6zerocsumtx \
+		tos inherit ttl 100
+	ip link set dev vx1 up
+
+	ip link set dev vx1 master br1
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+	tc qdisc add dev $swp1 clsact
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	bridge fdb append dev vx1 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx1 00:00:00:00:00:00 dst 2001:db8:5::1 self
+}
+
+switch_destroy()
+{
+	bridge fdb del dev vx1 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx1 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	tc qdisc del dev $swp1 clsact
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	ip link set dev vx1 nomaster
+	ip link set dev vx1 down
+	ip link del dev vx1
+
+	tc qdisc del dev $rp1 clsact
+	rp1_unset_addr
+	ip link set dev $rp1 down
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+vrp2_create()
+{
+	simple_if_init $rp2 2001:db8:3::2/64
+	__simple_if_init v1 v$rp2 2001:db8:4::2/64
+	__simple_if_init v3 v$rp2 2001:db8:5::2/64
+	tc qdisc add dev v1 clsact
+}
+
+vrp2_destroy()
+{
+	tc qdisc del dev v1 clsact
+	__simple_if_fini v3 2001:db8:5::2/64
+	__simple_if_fini v1 2001:db8:4::2/64
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+ns_init_common()
+{
+	local in_if=$1; shift
+	local in_addr=$1; shift
+	local other_in_addr=$1; shift
+	local nh_addr=$1; shift
+	local host_addr_ipv4=$1; shift
+	local host_addr_ipv6=$1; shift
+
+	ip link set dev $in_if up
+	ip address add dev $in_if $in_addr/64
+	tc qdisc add dev $in_if clsact
+
+	ip link add name br2 type bridge vlan_filtering 0
+	ip link set dev br2 up
+
+	ip link add name w1 type veth peer name w2
+
+	ip link set dev w1 master br2
+	ip link set dev w1 up
+
+	ip link add name vx2 type vxlan id 1000 local $in_addr \
+		dstport "$VXPORT" udp6zerocsumrx
+	ip link set dev vx2 up
+	bridge fdb append dev vx2 00:00:00:00:00:00 dst 2001:db8:3::1 self
+	bridge fdb append dev vx2 00:00:00:00:00:00 dst $other_in_addr self
+
+	ip link set dev vx2 master br2
+	tc qdisc add dev vx2 clsact
+
+	simple_if_init w2 $host_addr_ipv4/28 $host_addr_ipv6/64
+
+	ip route add 2001:db8:3::0/64 nexthop via $nh_addr
+	ip route add $other_in_addr/128 nexthop via $nh_addr
+}
+export -f ns_init_common
+
+ns1_create()
+{
+	ip netns add ns1
+	ip link set dev v2 netns ns1
+	in_ns ns1 \
+	      ns_init_common v2 2001:db8:4::1 2001:db8:5::1 2001:db8:4::2 \
+	      192.0.2.3 2001:db8:1::3
+}
+
+ns1_destroy()
+{
+	ip netns exec ns1 ip link set dev v2 netns 1
+	ip netns del ns1
+}
+
+ns2_create()
+{
+	ip netns add ns2
+	ip link set dev v4 netns ns2
+	in_ns ns2 \
+	      ns_init_common v4 2001:db8:5::1 2001:db8:4::1 2001:db8:5::2 \
+	      192.0.2.4 2001:db8:1::4
+}
+
+ns2_destroy()
+{
+	ip netns exec ns2 ip link set dev v4 netns 1
+	ip netns del ns2
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp1=${NETIFS[p5]}
+	rp2=${NETIFS[p6]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+	switch_create
+
+	ip link add name v1 type veth peer name v2
+	ip link add name v3 type veth peer name v4
+	vrp2_create
+	ns1_create
+	ns2_create
+
+	r1_mac=$(in_ns ns1 mac_get w2)
+	r2_mac=$(in_ns ns2 mac_get w2)
+	h2_mac=$(mac_get $h2)
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ns2_destroy
+	ns1_destroy
+	vrp2_destroy
+	ip link del dev v3
+	ip link del dev v1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+# For the first round of tests, vx1 is the first device to get
+# attached to the bridge, and at that point the local IP is already
+# configured. Try the other scenario of attaching the devices to a an
+# already-offloaded bridge, and only then assign the local IP.
+reapply_config()
+{
+	log_info "Reapplying configuration"
+
+	bridge fdb del dev vx1 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx1 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	ip link set dev vx1 nomaster
+	rp1_unset_addr
+	sleep 5
+
+	ip link set dev vx1 master br1
+	bridge fdb append dev vx1 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx1 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	sleep 1
+	rp1_set_addr
+	sleep 5
+}
+
+__ping_ipv4()
+{
+	local vxlan_local_ip=$1; shift
+	local vxlan_remote_ip=$1; shift
+	local src_ip=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local info=$1; shift
+
+	RET=0
+
+	tc filter add dev $rp1 egress protocol ipv6 pref 1 handle 101 \
+		flower ip_proto udp src_ip $vxlan_local_ip \
+		dst_ip $vxlan_remote_ip dst_port $VXPORT $TC_FLAG action pass
+	# Match ICMP-reply packets after decapsulation, so source IP is
+	# destination IP of the ping and destination IP is source IP of the
+	# ping.
+	tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+		flower src_ip $dst_ip dst_ip $src_ip \
+		$TC_FLAG action pass
+
+	# Send 100 packets and verify that at least 100 packets hit the rule,
+	# to overcome ARP noise.
+	PING_COUNT=100 PING_TIMEOUT=11 ping_do $dev $dst_ip
+	check_err $? "Ping failed"
+
+	tc_check_at_least_x_packets "dev $rp1 egress" 101 10 100
+	check_err $? "Encapsulated packets did not go through router"
+
+	tc_check_at_least_x_packets "dev $swp1 egress" 101 10 100
+	check_err $? "Decapsulated packets did not go through switch"
+
+	log_test "ping: $info"
+
+	tc filter del dev $swp1 egress
+	tc filter del dev $rp1 egress
+}
+
+ping_ipv4()
+{
+	RET=0
+
+	local local_sw_ip=2001:db8:3::1
+	local remote_ns1_ip=2001:db8:4::1
+	local remote_ns2_ip=2001:db8:5::1
+	local h1_ip=192.0.2.1
+	local w2_ns1_ip=192.0.2.3
+	local w2_ns2_ip=192.0.2.4
+
+	ping_test $h1 192.0.2.2 ": local->local"
+
+	__ping_ipv4 $local_sw_ip $remote_ns1_ip $h1_ip $w2_ns1_ip $h1 \
+		"local->remote 1"
+	__ping_ipv4 $local_sw_ip $remote_ns2_ip $h1_ip $w2_ns2_ip $h1 \
+		"local->remote 2"
+}
+
+__ping_ipv6()
+{
+	local vxlan_local_ip=$1; shift
+	local vxlan_remote_ip=$1; shift
+	local src_ip=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local info=$1; shift
+
+	RET=0
+
+	tc filter add dev $rp1 egress protocol ipv6 pref 1 handle 101 \
+		flower ip_proto udp src_ip $vxlan_local_ip \
+		dst_ip $vxlan_remote_ip dst_port $VXPORT $TC_FLAG action pass
+	# Match ICMP-reply packets after decapsulation, so source IP is
+	# destination IP of the ping and destination IP is source IP of the
+	# ping.
+	tc filter add dev $swp1 egress protocol ipv6 pref 1 handle 101 \
+		flower src_ip $dst_ip dst_ip $src_ip $TC_FLAG action pass
+
+	# Send 100 packets and verify that at least 100 packets hit the rule,
+	# to overcome neighbor discovery noise.
+	PING_COUNT=100 PING_TIMEOUT=11 ping6_do $dev $dst_ip
+	check_err $? "Ping failed"
+
+	tc_check_at_least_x_packets "dev $rp1 egress" 101 100
+	check_err $? "Encapsulated packets did not go through router"
+
+	tc_check_at_least_x_packets "dev $swp1 egress" 101 100
+	check_err $? "Decapsulated packets did not go through switch"
+
+	log_test "ping6: $info"
+
+	tc filter del dev $swp1 egress
+	tc filter del dev $rp1 egress
+}
+
+ping_ipv6()
+{
+	RET=0
+
+	local local_sw_ip=2001:db8:3::1
+	local remote_ns1_ip=2001:db8:4::1
+	local remote_ns2_ip=2001:db8:5::1
+	local h1_ip=2001:db8:1::1
+	local w2_ns1_ip=2001:db8:1::3
+	local w2_ns2_ip=2001:db8:1::4
+
+	ping6_test $h1 2001:db8:1::2 ": local->local"
+
+	__ping_ipv6 $local_sw_ip $remote_ns1_ip $h1_ip $w2_ns1_ip $h1 \
+		"local->remote 1"
+	__ping_ipv6 $local_sw_ip $remote_ns2_ip $h1_ip $w2_ns2_ip $h1 \
+		"local->remote 2"
+}
+
+maybe_in_ns()
+{
+	echo ${1:+in_ns} $1
+}
+
+__flood_counter_add_del()
+{
+	local add_del=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local ns=$1; shift
+
+	# Putting the ICMP capture both to HW and to SW will end up
+	# double-counting the packets that are trapped to slow path, such as for
+	# the unicast test. Adding either skip_hw or skip_sw fixes this problem,
+	# but with skip_hw, the flooded packets are not counted at all, because
+	# those are dropped due to MAC address mismatch; and skip_sw is a no-go
+	# for veth-based topologies.
+	#
+	# So try to install with skip_sw and fall back to skip_sw if that fails.
+
+	$(maybe_in_ns $ns) tc filter $add_del dev "$dev" ingress \
+	   proto ipv6 pref 100 flower dst_ip $dst_ip ip_proto \
+	   icmpv6 skip_sw action pass 2>/dev/null || \
+	$(maybe_in_ns $ns) tc filter $add_del dev "$dev" ingress \
+	   proto ipv6 pref 100 flower dst_ip $dst_ip ip_proto \
+	   icmpv6 skip_hw action pass
+}
+
+flood_counter_install()
+{
+	__flood_counter_add_del add "$@"
+}
+
+flood_counter_uninstall()
+{
+	__flood_counter_add_del del "$@"
+}
+
+flood_fetch_stat()
+{
+	local dev=$1; shift
+	local ns=$1; shift
+
+	$(maybe_in_ns $ns) tc_rule_stats_get $dev 100 ingress
+}
+
+flood_fetch_stats()
+{
+	local counters=("${@}")
+	local counter
+
+	for counter in "${counters[@]}"; do
+		flood_fetch_stat $counter
+	done
+}
+
+vxlan_flood_test()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local -a expects=("${@}")
+
+	local -a counters=($h2 "vx2 ns1" "vx2 ns2")
+	local counter
+	local key
+
+	for counter in "${counters[@]}"; do
+		flood_counter_install $dst $counter
+	done
+
+	local -a t0s=($(flood_fetch_stats "${counters[@]}"))
+	$MZ -6 $h1 -c 10 -d 100msec -p 64 -b $mac -B $dst -t icmp6 type=128 -q
+	sleep 1
+	local -a t1s=($(flood_fetch_stats "${counters[@]}"))
+
+	for key in ${!t0s[@]}; do
+		local delta=$((t1s[$key] - t0s[$key]))
+		local expect=${expects[$key]}
+
+		((expect == delta))
+		check_err $? "${counters[$key]}: Expected to capture $expect packets, got $delta."
+	done
+
+	for counter in "${counters[@]}"; do
+		flood_counter_uninstall $dst $counter
+	done
+}
+
+__test_flood()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	vxlan_flood_test $mac $dst 10 10 10
+
+	log_test "VXLAN: $what"
+}
+
+test_flood()
+{
+	__test_flood de:ad:be:ef:13:37 2001:db8:1::100 "flood"
+}
+
+vxlan_fdb_add_del()
+{
+	local add_del=$1; shift
+	local mac=$1; shift
+	local dev=$1; shift
+	local dst=$1; shift
+
+	bridge fdb $add_del dev $dev $mac self static permanent \
+		${dst:+dst} $dst 2>/dev/null
+	bridge fdb $add_del dev $dev $mac master static 2>/dev/null
+}
+
+__test_unicast()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local hit_idx=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	local -a expects=(0 0 0)
+	expects[$hit_idx]=10
+
+	vxlan_flood_test $mac $dst "${expects[@]}"
+
+	log_test "VXLAN: $what"
+}
+
+test_unicast()
+{
+	local -a targets=("$h2_mac $h2"
+			  "$r1_mac vx1 2001:db8:4::1"
+			  "$r2_mac vx1 2001:db8:5::1")
+	local target
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del add $target
+	done
+
+	__test_unicast $h2_mac 2001:db8:1::2 0 "local MAC unicast"
+	__test_unicast $r1_mac 2001:db8:1::3 1 "remote MAC 1 unicast"
+	__test_unicast $r2_mac 2001:db8:1::4 2 "remote MAC 2 unicast"
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del del $target
+	done
+}
+
+vxlan_ping_test()
+{
+	local ping_dev=$1; shift
+	local ping_dip=$1; shift
+	local ping_args=$1; shift
+	local capture_dev=$1; shift
+	local capture_dir=$1; shift
+	local capture_pref=$1; shift
+	local expect=$1; shift
+
+	local t0=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+	ping6_do $ping_dev $ping_dip "$ping_args"
+	local t1=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+	local delta=$((t1 - t0))
+
+	# Tolerate a couple stray extra packets.
+	((expect <= delta && delta <= expect + 2))
+	check_err $? "$capture_dev: Expected to capture $expect packets, got $delta."
+}
+
+test_ttl()
+{
+	RET=0
+
+	tc filter add dev v1 egress pref 77 protocol ipv6 \
+		flower ip_ttl 99 action pass
+	vxlan_ping_test $h1 2001:db8:1::3 "" v1 egress 77 10
+	tc filter del dev v1 egress pref 77 protocol ipv6
+
+	log_test "VXLAN: envelope TTL"
+}
+
+test_tos()
+{
+	RET=0
+
+	tc filter add dev v1 egress pref 77 protocol ipv6 \
+		flower ip_tos 0x14 action pass
+	vxlan_ping_test $h1 2001:db8:1::3 "-Q 0x14" v1 egress 77 10
+	vxlan_ping_test $h1 2001:db8:1::3 "-Q 0x18" v1 egress 77 0
+	tc filter del dev v1 egress pref 77 protocol ipv6
+
+	log_test "VXLAN: envelope TOS inheritance"
+}
+
+__test_ecn_encap()
+{
+	local q=$1; shift
+	local tos=$1; shift
+
+	RET=0
+
+	tc filter add dev v1 egress pref 77 protocol ipv6 \
+		flower ip_tos $tos action pass
+	sleep 1
+	vxlan_ping_test $h1 2001:db8:1::3 "-Q $q" v1 egress 77 10
+	tc filter del dev v1 egress pref 77 protocol ipv6
+
+	log_test "VXLAN: ECN encap: $q->$tos"
+}
+
+test_ecn_encap()
+{
+	# In accordance with INET_ECN_encapsulate()
+	__test_ecn_encap 0x00 0x00
+	__test_ecn_encap 0x01 0x01
+	__test_ecn_encap 0x02 0x02
+	__test_ecn_encap 0x03 0x02
+}
+
+vxlan_encapped_ping_do()
+{
+	local count=$1; shift
+	local dev=$1; shift
+	local next_hop_mac=$1; shift
+	local dest_ip=$1; shift
+	local dest_mac=$1; shift
+	local inner_tos=$1; shift
+	local outer_tos=$1; shift
+	local saddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:03"
+	local daddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01"
+
+	$MZ -6 $dev -c $count -d 100msec -q \
+		-b $next_hop_mac -B $dest_ip \
+		-t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(:
+		    )"08:"$(                      : VXLAN flags
+		    )"00:00:00:"$(                : VXLAN reserved
+		    )"00:03:e8:"$(                : VXLAN VNI
+		    )"00:"$(                      : VXLAN reserved
+		    )"$dest_mac:"$(               : ETH daddr
+		    )"$(mac_get w2):"$(           : ETH saddr
+		    )"86:dd:"$(                   : ETH type
+		    )"6"$(			  : IP version
+		    )"$inner_tos"$(               : Traffic class
+		    )"0:00:00:"$(                 : Flow label
+		    )"00:08:"$(                   : Payload length
+		    )"3a:"$(                      : Next header
+		    )"04:"$(                      : Hop limit
+		    )"$saddr:"$(		  : IP saddr
+		    )"$daddr:"$(		  : IP daddr
+		    )"80:"$(			  : ICMPv6.type
+		    )"00:"$(			  : ICMPv6.code
+		    )"00:"$(			  : ICMPv6.checksum
+		    )
+}
+export -f vxlan_encapped_ping_do
+
+vxlan_encapped_ping_test()
+{
+	local ping_dev=$1; shift
+	local nh_dev=$1; shift
+	local ping_dip=$1; shift
+	local inner_tos=$1; shift
+	local outer_tos=$1; shift
+	local stat_get=$1; shift
+	local expect=$1; shift
+
+	local t0=$($stat_get)
+
+	in_ns ns1 \
+		vxlan_encapped_ping_do 10 $ping_dev $(mac_get $nh_dev) \
+			$ping_dip $(mac_get $h1) \
+			$inner_tos $outer_tos
+	sleep 1
+	local t1=$($stat_get)
+	local delta=$((t1 - t0))
+
+	# Tolerate a couple stray extra packets.
+	((expect <= delta && delta <= expect + 2))
+	check_err $? "Expected to capture $expect packets, got $delta."
+}
+export -f vxlan_encapped_ping_test
+
+__test_ecn_decap()
+{
+	local orig_inner_tos=$1; shift
+	local orig_outer_tos=$1; shift
+	local decapped_tos=$1; shift
+
+	RET=0
+
+	tc filter add dev $h1 ingress pref 77 protocol ipv6 \
+		flower src_ip 2001:db8:1::3 dst_ip 2001:db8:1::1 \
+		ip_tos $decapped_tos action drop
+	sleep 1
+	vxlan_encapped_ping_test v2 v1 2001:db8:3::1 \
+				 $orig_inner_tos $orig_outer_tos \
+				 "tc_rule_stats_get $h1 77 ingress" 10
+	tc filter del dev $h1 ingress pref 77
+
+	log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->$decapped_tos"
+}
+
+test_ecn_decap_error()
+{
+	local orig_inner_tos="0:0"
+	local orig_outer_tos=03
+
+	RET=0
+
+	vxlan_encapped_ping_test v2 v1 2001:db8:3::1 \
+				 $orig_inner_tos $orig_outer_tos \
+				 "link_stats_rx_errors_get vx1" 10
+
+	log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->error"
+}
+
+test_ecn_decap()
+{
+	# In accordance with INET_ECN_decapsulate()
+	__test_ecn_decap "0:0" 00 0x00
+	__test_ecn_decap "0:0" 01 0x00
+	__test_ecn_decap "0:0" 02 0x00
+	# 00 03 is tested in test_ecn_decap_error()
+	__test_ecn_decap "0:1" 00 0x01
+	__test_ecn_decap "0:1" 01 0x01
+	__test_ecn_decap "0:1" 02 0x01
+	__test_ecn_decap "0:1" 03 0x03
+	__test_ecn_decap "0:2" 00 0x02
+	__test_ecn_decap "0:2" 01 0x01
+	__test_ecn_decap "0:2" 02 0x02
+	__test_ecn_decap "0:2" 03 0x03
+	__test_ecn_decap "0:3" 00 0x03
+	__test_ecn_decap "0:3" 01 0x03
+	__test_ecn_decap "0:3" 02 0x03
+	__test_ecn_decap "0:3" 03 0x03
+	test_ecn_decap_error
+}
+
+test_all()
+{
+	log_info "Running tests with UDP port $VXPORT"
+	tests_run
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+test_all
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472_ipv6.sh
new file mode 100755
index 000000000000..00540317737a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472_ipv6.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN tests with an unusual port number.
+
+VXPORT=8472
+ALL_TESTS="
+	ping_ipv4
+	ping_ipv6
+"
+source vxlan_bridge_1d_ipv6.sh
-- 
cgit v1.2.3


From 728b35259e28b6e85b6dbae9caf944548cebc9c6 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:45 +0200
Subject: selftests: forwarding: Add VxLAN tests with a VLAN-aware bridge for
 IPv6

The tests are very similar to their VLAN-unaware counterpart
(vxlan_bridge_1d_ipv6.sh and vxlan_bridge_1d_port_8472_ipv6.sh),
but instead of using multiple VLAN-unaware bridges, a single VLAN-aware
bridge is used with multiple VLANs.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/forwarding/vxlan_bridge_1q_ipv6.sh         | 837 +++++++++++++++++++++
 .../forwarding/vxlan_bridge_1q_port_8472_ipv6.sh   |  11 +
 2 files changed, 848 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
new file mode 100755
index 000000000000..d880df89bc8b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
@@ -0,0 +1,837 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-----------------------+                          +------------------------+
+# | H1 (vrf)              |                          | H2 (vrf)               |
+# | + $h1.10              |                          | + $h2.10               |
+# | | 192.0.2.1/28        |                          | | 192.0.2.2/28         |
+# | | 2001:db8:1::1/64    |                          | | 2001:db8:1::2/64     |
+# | |                     |                          | |                      |
+# | |  + $h1.20           |                          | |  + $h2.20            |
+# | \  | 198.51.100.1/24  |                          | \  | 198.51.100.2/24   |
+# |  \ | 2001:db8:2::1/64 |                          |  \ | 2001:db8:2::2/64  |
+# |   \|                  |                          |   \|                   |
+# |    + $h1              |                          |    + $h2               |
+# +----|------------------+                          +----|-------------------+
+#      |                                                  |
+# +----|--------------------------------------------------|-------------------+
+# | SW |                                                  |                   |
+# | +--|--------------------------------------------------|-----------------+ |
+# | |  + $swp1                   BR1 (802.1q)             + $swp2           | |
+# | |     vid 10                                             vid 10         | |
+# | |     vid 20                                             vid 20         | |
+# | |                                                                       | |
+# | |  + vx10 (vxlan)                        + vx20 (vxlan)                 | |
+# | |    local:                                local:                       | |
+# | |    2001:db8:3::1                         2001:db8:3::1                | |
+# | |    remote:                               remote:                      | |
+# | |    2001:db8:4::1 2001:db8:5::1           2001:db8:4::1 2001:db8:5::1  | |
+# | |    id 1000 dstport $VXPORT               id 2000 dstport $VXPORT      | |
+# | |    vid 10 pvid untagged                  vid 20 pvid untagged         | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |  2001:db8:4::0/64 via 2001:db8:3::2                                       |
+# |  2001:db8:5::0/64 via 2001:db8:3::2                                       |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|----------------------------------------------------------+
+# |    |                                             VRP2 (vrf)   |
+# |    + $rp2                                                     |
+# |      2001:db8:3::2/64                                         |
+# |                                                               |  (maybe) HW
+# =============================================================================
+# |                                                               |  (likely) SW
+# |    + v1 (veth)                             + v3 (veth)        |
+# |    | 2001:db8:4::2/64                      | 2001:db8:5::2/64 |
+# +----|---------------------------------------|------------------+
+#      |                                       |
+# +----|--------------------------------+ +----|-------------------------------+
+# |    + v2 (veth)        NS1 (netns)   | |    + v4 (veth)        NS2 (netns)  |
+# |      2001:db8:4::1/64               | |      2001:db8:5::1/64              |
+# |                                     | |                                    |
+# | 2001:db8:3::0/64 via 2001:db8:4::2  | | 2001:db8:3::0/64 via 2001:db8:5::2 |
+# | 2001:db8:5::1/128 via 2001:db8:4::2 | | 2001:db8:4::1/128 via              |
+# |                                     | |         2001:db8:5::2              |
+# |                                     | |                                    |
+# | +-------------------------------+   | | +-------------------------------+  |
+# | |                  BR2 (802.1q) |   | | |                  BR2 (802.1q) |  |
+# | |  + vx10 (vxlan)               |   | | |  + vx10 (vxlan)               |  |
+# | |    local 2001:db8:4::1        |   | | |    local 2001:db8:5::1        |  |
+# | |    remote 2001:db8:3::1       |   | | |    remote 2001:db8:3::1       |  |
+# | |    remote 2001:db8:5::1       |   | | |    remote 2001:db8:4::1       |  |
+# | |    id 1000 dstport $VXPORT    |   | | |    id 1000 dstport $VXPORT    |  |
+# | |    vid 10 pvid untagged       |   | | |    vid 10 pvid untagged       |  |
+# | |                               |   | | |                               |  |
+# | |  + vx20 (vxlan)               |   | | |  + vx20 (vxlan)               |  |
+# | |    local 2001:db8:4::1        |   | | |    local 2001:db8:5::1        |  |
+# | |    remote 2001:db8:3::1       |   | | |    remote 2001:db8:3::1       |  |
+# | |    remote 2001:db8:5::1       |   | | |    remote 2001:db8:4::1       |  |
+# | |    id 2000 dstport $VXPORT    |   | | |    id 2000 dstport $VXPORT    |  |
+# | |    vid 20 pvid untagged       |   | | |    vid 20 pvid untagged       |  |
+# | |                               |   | | |                               |  |
+# | |  + w1 (veth)                  |   | | |  + w1 (veth)                  |  |
+# | |  | vid 10                     |   | | |  | vid 10                     |  |
+# | |  | vid 20                     |   | | |  | vid 20                     |  |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# |    |                                | |    |                               |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# | |  + w2 (veth)        VW2 (vrf) |   | | |  + w2 (veth)        VW2 (vrf) |  |
+# | |  |\                           |   | | |  |\                           |  |
+# | |  | + w2.10                    |   | | |  | + w2.10                    |  |
+# | |  |   192.0.2.3/28             |   | | |  |   192.0.2.4/28             |  |
+# | |  |   2001:db8:1::3/64         |   | | |  |   2001:db8:1::4/64         |  |
+# | |  |                            |   | | |  |                            |  |
+# | |  + w2.20                      |   | | |  + w2.20                      |  |
+# | |    198.51.100.3/24            |   | | |    198.51.100.4/24            |  |
+# | |    2001:db8:2::3/64           |   | | |    2001:db8:2::4/64           |  |
+# | +-------------------------------+   | | +-------------------------------+  |
+# +-------------------------------------+ +------------------------------------+
+
+: ${VXPORT:=4789}
+export VXPORT
+
+: ${ALL_TESTS:="
+	ping_ipv4
+	ping_ipv6
+	test_flood
+	test_unicast
+	reapply_config
+	ping_ipv4
+	ping_ipv6
+	test_flood
+	test_unicast
+	test_pvid
+	ping_ipv4
+	ping_ipv6
+	test_flood
+	test_pvid
+"}
+
+NUM_NETIFS=6
+source lib.sh
+source tc_common.sh
+
+h1_create()
+{
+	simple_if_init $h1
+	tc qdisc add dev $h1 clsact
+	vlan_create $h1 10 v$h1 192.0.2.1/28 2001:db8:1::1/64
+	vlan_create $h1 20 v$h1 198.51.100.1/24 2001:db8:2::1/64
+}
+
+h1_destroy()
+{
+	vlan_destroy $h1 20
+	vlan_destroy $h1 10
+	tc qdisc del dev $h1 clsact
+	simple_if_fini $h1
+}
+
+h2_create()
+{
+	simple_if_init $h2
+	tc qdisc add dev $h2 clsact
+	vlan_create $h2 10 v$h2 192.0.2.2/28 2001:db8:1::2/64
+	vlan_create $h2 20 v$h2 198.51.100.2/24 2001:db8:2::2/64
+}
+
+h2_destroy()
+{
+	vlan_destroy $h2 20
+	vlan_destroy $h2 10
+	tc qdisc del dev $h2 clsact
+	simple_if_fini $h2
+}
+
+rp1_set_addr()
+{
+	ip address add dev $rp1 2001:db8:3::1/64
+
+	ip route add 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+	ip route add 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+}
+
+rp1_unset_addr()
+{
+	ip route del 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+	ip route del 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+
+	ip address del dev $rp1 2001:db8:3::1/64
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	ip link set dev $rp1 up
+	rp1_set_addr
+	tc qdisc add dev $rp1 clsact
+
+	ip link add name vx10 type vxlan id 1000 local 2001:db8:3::1 \
+		dstport "$VXPORT" nolearning udp6zerocsumrx udp6zerocsumtx \
+		tos inherit ttl 100
+	ip link set dev vx10 up
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 2000 local 2001:db8:3::1 \
+		dstport "$VXPORT" nolearning udp6zerocsumrx udp6zerocsumtx \
+		tos inherit ttl 100
+	ip link set dev vx20 up
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+	tc qdisc add dev $swp1 clsact
+	bridge vlan add vid 10 dev $swp1
+	bridge vlan add vid 20 dev $swp1
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+	bridge vlan add vid 10 dev $swp2
+	bridge vlan add vid 20 dev $swp2
+
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 2001:db8:5::1 self
+
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 2001:db8:5::1 self
+}
+
+switch_destroy()
+{
+	bridge fdb del dev vx20 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx20 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	bridge fdb del dev vx10 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx10 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	bridge vlan del vid 20 dev $swp2
+	bridge vlan del vid 10 dev $swp2
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	bridge vlan del vid 20 dev $swp1
+	bridge vlan del vid 10 dev $swp1
+	tc qdisc del dev $swp1 clsact
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	bridge vlan del vid 20 dev vx20
+	ip link set dev vx20 nomaster
+
+	ip link set dev vx20 down
+	ip link del dev vx20
+
+	bridge vlan del vid 10 dev vx10
+	ip link set dev vx10 nomaster
+
+	ip link set dev vx10 down
+	ip link del dev vx10
+
+	tc qdisc del dev $rp1 clsact
+	rp1_unset_addr
+	ip link set dev $rp1 down
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+vrp2_create()
+{
+	simple_if_init $rp2 2001:db8:3::2/64
+	__simple_if_init v1 v$rp2 2001:db8:4::2/64
+	__simple_if_init v3 v$rp2 2001:db8:5::2/64
+	tc qdisc add dev v1 clsact
+}
+
+vrp2_destroy()
+{
+	tc qdisc del dev v1 clsact
+	__simple_if_fini v3 2001:db8:5::2/64
+	__simple_if_fini v1 2001:db8:4::2/64
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+ns_init_common()
+{
+	local in_if=$1; shift
+	local in_addr=$1; shift
+	local other_in_addr=$1; shift
+	local nh_addr=$1; shift
+	local host_addr1_ipv4=$1; shift
+	local host_addr1_ipv6=$1; shift
+	local host_addr2_ipv4=$1; shift
+	local host_addr2_ipv6=$1; shift
+
+	ip link set dev $in_if up
+	ip address add dev $in_if $in_addr/64
+	tc qdisc add dev $in_if clsact
+
+	ip link add name br2 type bridge vlan_filtering 1 vlan_default_pvid 0
+	ip link set dev br2 up
+
+	ip link add name w1 type veth peer name w2
+
+	ip link set dev w1 master br2
+	ip link set dev w1 up
+
+	bridge vlan add vid 10 dev w1
+	bridge vlan add vid 20 dev w1
+
+	ip link add name vx10 type vxlan id 1000 local $in_addr \
+		dstport "$VXPORT" udp6zerocsumrx
+	ip link set dev vx10 up
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 2001:db8:3::1 self
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst $other_in_addr self
+
+	ip link set dev vx10 master br2
+	tc qdisc add dev vx10 clsact
+
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 2000 local $in_addr \
+		dstport "$VXPORT" udp6zerocsumrx
+	ip link set dev vx20 up
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 2001:db8:3::1 self
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst $other_in_addr self
+
+	ip link set dev vx20 master br2
+	tc qdisc add dev vx20 clsact
+
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	simple_if_init w2
+        vlan_create w2 10 vw2 $host_addr1_ipv4/28 $host_addr1_ipv6/64
+        vlan_create w2 20 vw2 $host_addr2_ipv4/24 $host_addr2_ipv6/64
+
+	ip route add 2001:db8:3::0/64 nexthop via $nh_addr
+	ip route add $other_in_addr/128 nexthop via $nh_addr
+}
+export -f ns_init_common
+
+ns1_create()
+{
+	ip netns add ns1
+	ip link set dev v2 netns ns1
+	in_ns ns1 \
+	      ns_init_common v2 2001:db8:4::1 2001:db8:5::1 2001:db8:4::2 \
+	      192.0.2.3 2001:db8:1::3 198.51.100.3 2001:db8:2::3
+}
+
+ns1_destroy()
+{
+	ip netns exec ns1 ip link set dev v2 netns 1
+	ip netns del ns1
+}
+
+ns2_create()
+{
+	ip netns add ns2
+	ip link set dev v4 netns ns2
+	in_ns ns2 \
+	      ns_init_common v4 2001:db8:5::1 2001:db8:4::1 2001:db8:5::2 \
+	      192.0.2.4 2001:db8:1::4 198.51.100.4 2001:db8:2::4
+}
+
+ns2_destroy()
+{
+	ip netns exec ns2 ip link set dev v4 netns 1
+	ip netns del ns2
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp1=${NETIFS[p5]}
+	rp2=${NETIFS[p6]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+	switch_create
+
+	ip link add name v1 type veth peer name v2
+	ip link add name v3 type veth peer name v4
+	vrp2_create
+	ns1_create
+	ns2_create
+
+	r1_mac=$(in_ns ns1 mac_get w2)
+	r2_mac=$(in_ns ns2 mac_get w2)
+	h2_mac=$(mac_get $h2)
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ns2_destroy
+	ns1_destroy
+	vrp2_destroy
+	ip link del dev v3
+	ip link del dev v1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+# For the first round of tests, vx10 and vx20 were the first devices to get
+# attached to the bridge, and at that point the local IP is already
+# configured. Try the other scenario of attaching these devices to a bridge
+# that already has local ports members, and only then assign the local IP.
+reapply_config()
+{
+	log_info "Reapplying configuration"
+
+	bridge fdb del dev vx20 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx20 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	bridge fdb del dev vx10 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx10 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	ip link set dev vx20 nomaster
+	ip link set dev vx10 nomaster
+
+	rp1_unset_addr
+	sleep 5
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx10 00:00:00:00:00:00 dst 2001:db8:5::1 self
+
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx20 00:00:00:00:00:00 dst 2001:db8:5::1 self
+
+	rp1_set_addr
+	sleep 5
+}
+
+__ping_ipv4()
+{
+	local vxlan_local_ip=$1; shift
+	local vxlan_remote_ip=$1; shift
+	local src_ip=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local info=$1; shift
+
+	RET=0
+
+	tc filter add dev $rp1 egress protocol ipv6 pref 1 handle 101 \
+		flower ip_proto udp src_ip $vxlan_local_ip \
+		dst_ip $vxlan_remote_ip dst_port $VXPORT $TC_FLAG action pass
+	# Match ICMP-reply packets after decapsulation, so source IP is
+	# destination IP of the ping and destination IP is source IP of the
+	# ping.
+	tc filter add dev $swp1 egress protocol 802.1q pref 1 handle 101 \
+		flower vlan_ethtype ipv4 src_ip $dst_ip dst_ip $src_ip \
+		$TC_FLAG action pass
+
+	# Send 100 packets and verify that at least 100 packets hit the rule,
+	# to overcome ARP noise.
+	PING_COUNT=100 PING_TIMEOUT=11 ping_do $dev $dst_ip
+	check_err $? "Ping failed"
+
+	tc_check_at_least_x_packets "dev $rp1 egress" 101 10 100
+	check_err $? "Encapsulated packets did not go through router"
+
+	tc_check_at_least_x_packets "dev $swp1 egress" 101 10 100
+	check_err $? "Decapsulated packets did not go through switch"
+
+	log_test "ping: $info"
+
+	tc filter del dev $swp1 egress
+	tc filter del dev $rp1 egress
+}
+
+ping_ipv4()
+{
+	RET=0
+
+	local local_sw_ip=2001:db8:3::1
+	local remote_ns1_ip=2001:db8:4::1
+	local remote_ns2_ip=2001:db8:5::1
+	local h1_10_ip=192.0.2.1
+	local h1_20_ip=198.51.100.1
+	local w2_10_ns1_ip=192.0.2.3
+	local w2_10_ns2_ip=192.0.2.4
+	local w2_20_ns1_ip=198.51.100.3
+	local w2_20_ns2_ip=198.51.100.4
+
+	ping_test $h1.10 192.0.2.2 ": local->local vid 10"
+	ping_test $h1.20 198.51.100.2 ": local->local vid 20"
+
+	__ping_ipv4 $local_sw_ip $remote_ns1_ip $h1_10_ip $w2_10_ns1_ip $h1.10 \
+		"local->remote 1 vid 10"
+	__ping_ipv4 $local_sw_ip $remote_ns2_ip $h1_10_ip $w2_10_ns2_ip $h1.10 \
+		"local->remote 2 vid 10"
+	__ping_ipv4 $local_sw_ip $remote_ns1_ip $h1_20_ip $w2_20_ns1_ip $h1.20 \
+		"local->remote 1 vid 20"
+	__ping_ipv4 $local_sw_ip $remote_ns2_ip $h1_20_ip $w2_20_ns2_ip $h1.20 \
+		"local->remote 2 vid 20"
+}
+
+__ping_ipv6()
+{
+	local vxlan_local_ip=$1; shift
+	local vxlan_remote_ip=$1; shift
+	local src_ip=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local info=$1; shift
+
+	RET=0
+
+	tc filter add dev $rp1 egress protocol ipv6 pref 1 handle 101 \
+		flower ip_proto udp src_ip $vxlan_local_ip \
+		dst_ip $vxlan_remote_ip dst_port $VXPORT $TC_FLAG action pass
+	# Match ICMP-reply packets after decapsulation, so source IP is
+	# destination IP of the ping and destination IP is source IP of the
+	# ping.
+	tc filter add dev $swp1 egress protocol 802.1q pref 1 handle 101 \
+		flower vlan_ethtype ipv6 src_ip $dst_ip dst_ip $src_ip \
+		$TC_FLAG action pass
+
+	# Send 100 packets and verify that at least 100 packets hit the rule,
+	# to overcome neighbor discovery noise.
+	PING_COUNT=100 PING_TIMEOUT=11 ping6_do $dev $dst_ip
+	check_err $? "Ping failed"
+
+	tc_check_at_least_x_packets "dev $rp1 egress" 101 100
+	check_err $? "Encapsulated packets did not go through router"
+
+	tc_check_at_least_x_packets "dev $swp1 egress" 101 100
+	check_err $? "Decapsulated packets did not go through switch"
+
+	log_test "ping6: $info"
+
+	tc filter del dev $swp1 egress
+	tc filter del dev $rp1 egress
+}
+
+ping_ipv6()
+{
+	RET=0
+
+	local local_sw_ip=2001:db8:3::1
+	local remote_ns1_ip=2001:db8:4::1
+	local remote_ns2_ip=2001:db8:5::1
+	local h1_10_ip=2001:db8:1::1
+	local h1_20_ip=2001:db8:2::1
+	local w2_10_ns1_ip=2001:db8:1::3
+	local w2_10_ns2_ip=2001:db8:1::4
+	local w2_20_ns1_ip=2001:db8:2::3
+	local w2_20_ns2_ip=2001:db8:2::4
+
+	ping6_test $h1.10 2001:db8:1::2 ": local->local vid 10"
+	ping6_test $h1.20 2001:db8:2::2 ": local->local vid 20"
+
+	__ping_ipv6 $local_sw_ip $remote_ns1_ip $h1_10_ip $w2_10_ns1_ip $h1.10 \
+		"local->remote 1 vid 10"
+	__ping_ipv6 $local_sw_ip $remote_ns2_ip $h1_10_ip $w2_10_ns2_ip $h1.10 \
+		"local->remote 2 vid 10"
+	__ping_ipv6 $local_sw_ip $remote_ns1_ip $h1_20_ip $w2_20_ns1_ip $h1.20 \
+		"local->remote 1 vid 20"
+	__ping_ipv6 $local_sw_ip $remote_ns2_ip $h1_20_ip $w2_20_ns2_ip $h1.20 \
+		"local->remote 2 vid 20"
+}
+
+maybe_in_ns()
+{
+	echo ${1:+in_ns} $1
+}
+
+__flood_counter_add_del()
+{
+	local add_del=$1; shift
+	local dst_ip=$1; shift
+	local dev=$1; shift
+	local ns=$1; shift
+
+	# Putting the ICMP capture both to HW and to SW will end up
+	# double-counting the packets that are trapped to slow path, such as for
+	# the unicast test. Adding either skip_hw or skip_sw fixes this problem,
+	# but with skip_hw, the flooded packets are not counted at all, because
+	# those are dropped due to MAC address mismatch; and skip_sw is a no-go
+	# for veth-based topologies.
+	#
+	# So try to install with skip_sw and fall back to skip_sw if that fails.
+
+	$(maybe_in_ns $ns) tc filter $add_del dev "$dev" ingress \
+	   proto ipv6 pref 100 flower dst_ip $dst_ip ip_proto \
+	   icmpv6 skip_sw action pass 2>/dev/null || \
+	$(maybe_in_ns $ns) tc filter $add_del dev "$dev" ingress \
+	   proto ipv6 pref 100 flower dst_ip $dst_ip ip_proto \
+	   icmpv6 skip_hw action pass
+}
+
+flood_counter_install()
+{
+	__flood_counter_add_del add "$@"
+}
+
+flood_counter_uninstall()
+{
+	__flood_counter_add_del del "$@"
+}
+
+flood_fetch_stat()
+{
+	local dev=$1; shift
+	local ns=$1; shift
+
+	$(maybe_in_ns $ns) tc_rule_stats_get $dev 100 ingress
+}
+
+flood_fetch_stats()
+{
+	local counters=("${@}")
+	local counter
+
+	for counter in "${counters[@]}"; do
+		flood_fetch_stat $counter
+	done
+}
+
+vxlan_flood_test()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local vid=$1; shift
+	local -a expects=("${@}")
+
+	local -a counters=($h2 "vx10 ns1" "vx20 ns1" "vx10 ns2" "vx20 ns2")
+	local counter
+	local key
+
+	# Packets reach the local host tagged whereas they reach the VxLAN
+	# devices untagged. In order to be able to use the same filter for
+	# all counters, make sure the packets also reach the local host
+	# untagged
+	bridge vlan add vid $vid dev $swp2 untagged
+	for counter in "${counters[@]}"; do
+		flood_counter_install $dst $counter
+	done
+
+	local -a t0s=($(flood_fetch_stats "${counters[@]}"))
+	$MZ -6 $h1 -Q $vid -c 10 -d 100msec -p 64 -b $mac -B $dst -t icmp6 type=128 -q
+	sleep 1
+	local -a t1s=($(flood_fetch_stats "${counters[@]}"))
+
+	for key in ${!t0s[@]}; do
+		local delta=$((t1s[$key] - t0s[$key]))
+		local expect=${expects[$key]}
+
+		((expect == delta))
+		check_err $? "${counters[$key]}: Expected to capture $expect packets, got $delta."
+	done
+
+	for counter in "${counters[@]}"; do
+		flood_counter_uninstall $dst $counter
+	done
+	bridge vlan add vid $vid dev $swp2
+}
+
+__test_flood()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local vid=$1; shift
+	local what=$1; shift
+	local -a expects=("${@}")
+
+	RET=0
+
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: $what"
+}
+
+test_flood()
+{
+	__test_flood de:ad:be:ef:13:37 2001:db8:1::100 10 "flood vlan 10" \
+		10 10 0 10 0
+	__test_flood ca:fe:be:ef:13:37 2001:db8:2::100 20 "flood vlan 20" \
+		10 0 10 0 10
+}
+
+vxlan_fdb_add_del()
+{
+	local add_del=$1; shift
+	local vid=$1; shift
+	local mac=$1; shift
+	local dev=$1; shift
+	local dst=$1; shift
+
+	bridge fdb $add_del dev $dev $mac self static permanent \
+		${dst:+dst} $dst 2>/dev/null
+	bridge fdb $add_del dev $dev $mac master static vlan $vid 2>/dev/null
+}
+
+__test_unicast()
+{
+	local mac=$1; shift
+	local dst=$1; shift
+	local hit_idx=$1; shift
+	local vid=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	local -a expects=(0 0 0 0 0)
+	expects[$hit_idx]=10
+
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: $what"
+}
+
+test_unicast()
+{
+	local -a targets=("$h2_mac $h2"
+			  "$r1_mac vx10 2001:db8:4::1"
+			  "$r2_mac vx10 2001:db8:5::1")
+	local target
+
+	log_info "unicast vlan 10"
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del add 10 $target
+	done
+
+	__test_unicast $h2_mac 2001:db8:1::2 0 10 "local MAC unicast"
+	__test_unicast $r1_mac 2001:db8:1::3 1 10 "remote MAC 1 unicast"
+	__test_unicast $r2_mac 2001:db8:1::4 3 10 "remote MAC 2 unicast"
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del del 10 $target
+	done
+
+	log_info "unicast vlan 20"
+
+	targets=("$h2_mac $h2" "$r1_mac vx20 2001:db8:4::1" \
+		 "$r2_mac vx20 2001:db8:5::1")
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del add 20 $target
+	done
+
+	__test_unicast $h2_mac 2001:db8:2::2 0 20 "local MAC unicast"
+	__test_unicast $r1_mac 2001:db8:2::3 2 20 "remote MAC 1 unicast"
+	__test_unicast $r2_mac 2001:db8:2::4 4 20 "remote MAC 2 unicast"
+
+	for target in "${targets[@]}"; do
+		vxlan_fdb_add_del del 20 $target
+	done
+}
+
+test_pvid()
+{
+	local -a expects=(0 0 0 0 0)
+	local mac=de:ad:be:ef:13:37
+	local dst=2001:db8:1::100
+	local vid=10
+
+	# Check that flooding works
+	RET=0
+
+	expects[0]=10; expects[1]=10; expects[3]=10
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: flood before pvid off"
+
+	# Toggle PVID off and test that flood to remote hosts does not work
+	RET=0
+
+	bridge vlan add vid 10 dev vx10
+
+	expects[0]=10; expects[1]=0; expects[3]=0
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: flood after pvid off"
+
+	# Toggle PVID on and test that flood to remote hosts does work
+	RET=0
+
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	expects[0]=10; expects[1]=10; expects[3]=10
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: flood after pvid on"
+
+	# Add a new VLAN and test that it does not affect flooding
+	RET=0
+
+	bridge vlan add vid 30 dev vx10
+
+	expects[0]=10; expects[1]=10; expects[3]=10
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	bridge vlan del vid 30 dev vx10
+
+	log_test "VXLAN: flood after vlan add"
+
+	# Remove currently mapped VLAN and test that flood to remote hosts does
+	# not work
+	RET=0
+
+	bridge vlan del vid 10 dev vx10
+
+	expects[0]=10; expects[1]=0; expects[3]=0
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: flood after vlan delete"
+
+	# Re-add the VLAN and test that flood to remote hosts does work
+	RET=0
+
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	expects[0]=10; expects[1]=10; expects[3]=10
+	vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+	log_test "VXLAN: flood after vlan re-add"
+}
+
+test_all()
+{
+	log_info "Running tests with UDP port $VXPORT"
+	tests_run
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+test_all
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472_ipv6.sh
new file mode 100755
index 000000000000..344f43ccb755
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472_ipv6.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN tests with an unusual port number.
+
+VXPORT=8472
+ALL_TESTS="
+	ping_ipv4
+	ping_ipv6
+"
+source vxlan_bridge_1q_ipv6.sh
-- 
cgit v1.2.3


From dc498cdda0ce76527e276e504324de681482d48a Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:46 +0200
Subject: selftests: forwarding: vxlan_bridge_1q: Remove unused function

Remove `vxlan_ping_test()` which is not used and probably was copied
mistakenly from vxlan_bridge_1d.sh.

This was found while adding an equivalent test for IPv6.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/vxlan_bridge_1q.sh      | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
index a5789721ba92..a596bbf3ed6a 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
@@ -680,26 +680,6 @@ test_pvid()
 	log_test "VXLAN: flood after vlan re-add"
 }
 
-vxlan_ping_test()
-{
-	local ping_dev=$1; shift
-	local ping_dip=$1; shift
-	local ping_args=$1; shift
-	local capture_dev=$1; shift
-	local capture_dir=$1; shift
-	local capture_pref=$1; shift
-	local expect=$1; shift
-
-	local t0=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
-	ping_do $ping_dev $ping_dip "$ping_args"
-	local t1=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
-	local delta=$((t1 - t0))
-
-	# Tolerate a couple stray extra packets.
-	((expect <= delta && delta <= expect + 2))
-	check_err $? "$capture_dev: Expected to capture $expect packets, got $delta."
-}
-
 __test_learning()
 {
 	local -a expects=(0 0 0 0 0)
-- 
cgit v1.2.3


From 2902bae465c0aa718d18c72cceab95c7c2686afc Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:47 +0200
Subject: selftests: forwarding: Add a test for VxLAN asymmetric routing with
 IPv6

In asymmetric routing the ingress VTEP routes the packet into the
correct VxLAN tunnel, whereas the egress VTEP only bridges the packet to
the correct host. Therefore, packets in different directions use
different VNIs - the target VNI.

Add a test which is similar to the existing IPv4 test to check IPv6.

The test uses a simple topology with two VTEPs and two VNIs and verifies
that ping passes between hosts (local / remote) in the same VLAN (VNI)
and in different VLANs belonging to the same tenant (VRF).

While the test does not check VM mobility, it does configure an anycast
gateway using a macvlan device on both VTEPs.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/forwarding/vxlan_asymmetric_ipv6.sh        | 504 +++++++++++++++++++++
 1 file changed, 504 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_asymmetric_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_asymmetric_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_asymmetric_ipv6.sh
new file mode 100755
index 000000000000..f4930098974f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_asymmetric_ipv6.sh
@@ -0,0 +1,504 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------------------+            +-----------------------------+
+# |                         vrf-h1 |            |                      vrf-h2 |
+# |    + $h1                       |            | + $h2                       |
+# |    | 2001:db8:1::1/64          |            | | 2001:db8:2::1/64          |
+# |    | default via 2001:db8:1::3 |            | | default via 2001:db8:2::3 |
+# +----|---------------------------+            +-|---------------------------+
+#      |                                          |
+# +----|------------------------------------------|---------------------------+
+# | SW |                                          |                           |
+# | +--|------------------------------------------|-------------------------+ |
+# | |  + $swp1                         br1        + $swp2                   | |
+# | |     vid 10 pvid untagged                       vid 20 pvid untagged   | |
+# | |                                                                       | |
+# | |  + vx10                                     + vx20                    | |
+# | |    local 2001:db8:3::1                        local 2001:db8:3::1     | |
+# | |    remote 2001:db8:3::2                       remote 2001:db8:3::2    | |
+# | |    id 1000                                    id 2000                 | |
+# | |    dstport 4789                               dstport 4789            | |
+# | |    vid 10 pvid untagged                       vid 20 pvid untagged    | |
+# | |                                                                       | |
+# | +-----------------------------------+-----------------------------------+ |
+# |                                     |                                     |
+# | +-----------------------------------|-----------------------------------+ |
+# | |                                   |                                   | |
+# | |  +--------------------------------+--------------------------------+  | |
+# | |  |                                                                 |  | |
+# | |  + vlan10                                                   vlan20 +  | |
+# | |  | 2001:db8:1::2/64                               2001:db8:2::2/64 |  | |
+# | |  |                                                                 |  | |
+# | |  + vlan10-v (macvlan)                           vlan20-v (macvlan) +  | |
+# | |    2001:db8:1::3/64                               2001:db8:2::3/64    | |
+# | |    00:00:5e:00:01:01                             00:00:5e:00:01:01    | |
+# | |                               vrf-green                               | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |    + $rp1                                       +lo                       |
+# |    | 2001:db8:4::1/64                            2001:db8:3::1/128        |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|--------------------------------------------------------+
+# |    |                            vrf-spine                   |
+# |    + $rp2                                                   |
+# |      2001:db8:4::2/64                                       |
+# |                                                             |   (maybe) HW
+# =============================================================================
+# |                                                             |  (likely) SW
+# |                                                             |
+# |    + v1 (veth)                                              |
+# |    | 2001:db8:5::2/64                                       |
+# +----|--------------------------------------------------------+
+#      |
+# +----|----------------------------------------------------------------------+
+# |    + v2 (veth)                                  +lo           NS1 (netns) |
+# |      2001:db8:5::1/64                            2001:db8:3::2/128        |
+# |                                                                           |
+# | +-----------------------------------------------------------------------+ |
+# | |                               vrf-green                               | |
+# | |  + vlan10-v (macvlan)                           vlan20-v (macvlan) +  | |
+# | |  | 2001:db8:1::3/64                               2001:db8:2::3/64 |  | |
+# | |  | 00:00:5e:00:01:01                             00:00:5e:00:01:01 |  | |
+# | |  |                                                                 |  | |
+# | |  + vlan10                                                   vlan20 +  | |
+# | |  | 2001:db8:1::3/64                               2001:db8:2::3/64 |  | |
+# | |  |                                                                 |  | |
+# | |  +--------------------------------+--------------------------------+  | |
+# | |                                   |                                   | |
+# | +-----------------------------------|-----------------------------------+ |
+# |                                     |                                     |
+# | +-----------------------------------+-----------------------------------+ |
+# | |                                                                       | |
+# | |  + vx10                                     + vx20                    | |
+# | |    local 2001:db8:3::2                        local 2001:db8:3::2     | |
+# | |    remote 2001:db8:3::1                       remote 2001:db8:3::1    | |
+# | |    id 1000                                    id 2000                 | |
+# | |    dstport 4789                               dstport 4789            | |
+# | |    vid 10 pvid untagged                       vid 20 pvid untagged    | |
+# | |                                                                       | |
+# | |  + w1 (veth)                                + w3 (veth)               | |
+# | |  | vid 10 pvid untagged          br1        | vid 20 pvid untagged    | |
+# | +--|------------------------------------------|-------------------------+ |
+# |    |                                          |                           |
+# |    |                                          |                           |
+# | +--|----------------------+                +--|-------------------------+ |
+# | |  |               vrf-h1 |                |  |                  vrf-h2 | |
+# | |  + w2 (veth)            |                |  + w4 (veth)               | |
+# | |    2001:db8:1::4/64     |                |    2001:db8:2::4/64        | |
+# | |    default via          |                |    default via             | |
+# | |    2001:db8:1::3/64     |                |    2001:db8:2::3/64        | |
+# | +-------------------------+                +----------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+	ping_ipv6
+	arp_decap
+"
+NUM_NETIFS=6
+source lib.sh
+
+require_command $ARPING
+
+hx_create()
+{
+	local vrf_name=$1; shift
+	local if_name=$1; shift
+	local ip_addr=$1; shift
+	local gw_ip=$1; shift
+
+	vrf_create $vrf_name
+	ip link set dev $if_name master $vrf_name
+	ip link set dev $vrf_name up
+	ip link set dev $if_name up
+
+	ip address add $ip_addr/64 dev $if_name
+	ip neigh replace $gw_ip lladdr 00:00:5e:00:01:01 nud permanent \
+		dev $if_name
+	ip route add default vrf $vrf_name nexthop via $gw_ip
+}
+export -f hx_create
+
+hx_destroy()
+{
+	local vrf_name=$1; shift
+	local if_name=$1; shift
+	local ip_addr=$1; shift
+	local gw_ip=$1; shift
+
+	ip route del default vrf $vrf_name nexthop via $gw_ip
+	ip neigh del $gw_ip dev $if_name
+	ip address del $ip_addr/64 dev $if_name
+
+	ip link set dev $if_name down
+	vrf_destroy $vrf_name
+}
+
+h1_create()
+{
+	hx_create "vrf-h1" $h1 2001:db8:1::1 2001:db8:1::3
+}
+
+h1_destroy()
+{
+	hx_destroy "vrf-h1" $h1 2001:db8:1::1 2001:db8:1::3
+}
+
+h2_create()
+{
+	hx_create "vrf-h2" $h2 2001:db8:2::1 2001:db8:2::3
+}
+
+h2_destroy()
+{
+	hx_destroy "vrf-h2" $h2 2001:db8:2::1 2001:db8:2::3
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	ip link set dev $rp1 up
+	ip address add dev $rp1 2001:db8:4::1/64
+	ip route add 2001:db8:3::2/128 nexthop via 2001:db8:4::2
+
+	ip link add name vx10 type vxlan id 1000		\
+		local 2001:db8:3::1 remote 2001:db8:3::2 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx10 up
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 2000		\
+		local 2001:db8:3::1 remote 2001:db8:3::2 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx20 up
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+	bridge vlan add vid 10 dev $swp1 pvid untagged
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+	bridge vlan add vid 20 dev $swp2 pvid untagged
+
+	ip address add 2001:db8:3::1/128 dev lo
+
+	# Create SVIs
+	vrf_create "vrf-green"
+	ip link set dev vrf-green up
+
+	ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+	ip address add 2001:db8:1::2/64 dev vlan10
+	ip link add link vlan10 name vlan10-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:1::3/64 dev vlan10-v
+
+	ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+	ip address add 2001:db8:2::2/64 dev vlan20
+	ip link add link vlan20 name vlan20-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:2::3/64 dev vlan20-v
+
+	bridge vlan add vid 10 dev br1 self
+	bridge vlan add vid 20 dev br1 self
+
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+}
+
+switch_destroy()
+{
+	bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20
+	bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10
+
+	bridge vlan del vid 20 dev br1 self
+	bridge vlan del vid 10 dev br1 self
+
+	ip link del dev vlan20
+
+	ip link del dev vlan10
+
+	vrf_destroy "vrf-green"
+
+	ip address del 2001:db8:3::1/128 dev lo
+
+	bridge vlan del vid 20 dev $swp2
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	bridge vlan del vid 10 dev $swp1
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	bridge vlan del vid 20 dev vx20
+	ip link set dev vx20 nomaster
+
+	ip link set dev vx20 down
+	ip link del dev vx20
+
+	bridge vlan del vid 10 dev vx10
+	ip link set dev vx10 nomaster
+
+	ip link set dev vx10 down
+	ip link del dev vx10
+
+	ip route del 2001:db8:3::2 nexthop via 2001:db8:4::2
+	ip address del dev $rp1 2001:db8:4::1/64
+	ip link set dev $rp1 down
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+spine_create()
+{
+	vrf_create "vrf-spine"
+	ip link set dev $rp2 master vrf-spine
+	ip link set dev v1 master vrf-spine
+	ip link set dev vrf-spine up
+	ip link set dev $rp2 up
+	ip link set dev v1 up
+
+	ip address add 2001:db8:4::2/64 dev $rp2
+	ip address add 2001:db8:5::2/64 dev v1
+
+	ip route add 2001:db8:3::1/128 vrf vrf-spine nexthop via \
+		2001:db8:4::1
+	ip route add 2001:db8:3::2/128 vrf vrf-spine nexthop via \
+		2001:db8:5::1
+}
+
+spine_destroy()
+{
+	ip route del 2001:db8:3::2/128 vrf vrf-spine nexthop via \
+		2001:db8:5::1
+	ip route del 2001:db8:3::1/128 vrf vrf-spine nexthop via \
+		2001:db8:4::1
+
+	ip address del 2001:db8:5::2/64 dev v1
+	ip address del 2001:db8:4::2/64 dev $rp2
+
+	ip link set dev v1 down
+	ip link set dev $rp2 down
+	vrf_destroy "vrf-spine"
+}
+
+ns_h1_create()
+{
+	hx_create "vrf-h1" w2 2001:db8:1::4 2001:db8:1::3
+}
+export -f ns_h1_create
+
+ns_h2_create()
+{
+	hx_create "vrf-h2" w4 2001:db8:2::4 2001:db8:2::3
+}
+export -f ns_h2_create
+
+ns_switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 0
+	ip link set dev br1 up
+
+	ip link set dev v2 up
+	ip address add dev v2 2001:db8:5::1/64
+	ip route add 2001:db8:3::1 nexthop via 2001:db8:5::2
+
+	ip link add name vx10 type vxlan id 1000		\
+		local 2001:db8:3::2 remote 2001:db8:3::1 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx10 up
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 2000		\
+		local 2001:db8:3::2 remote 2001:db8:3::1 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx20 up
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	ip link set dev w1 master br1
+	ip link set dev w1 up
+	bridge vlan add vid 10 dev w1 pvid untagged
+
+	ip link set dev w3 master br1
+	ip link set dev w3 up
+	bridge vlan add vid 20 dev w3 pvid untagged
+
+	ip address add 2001:db8:3::2/128 dev lo
+
+	# Create SVIs
+	vrf_create "vrf-green"
+	ip link set dev vrf-green up
+
+	ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+	ip address add 2001:db8:1::3/64 dev vlan10
+	ip link add link vlan10 name vlan10-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:1::3/64 dev vlan10-v
+
+	ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+	ip address add 2001:db8:2::3/64 dev vlan20
+	ip link add link vlan20 name vlan20-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:2::3/64 dev vlan20-v
+
+	bridge vlan add vid 10 dev br1 self
+	bridge vlan add vid 20 dev br1 self
+
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+}
+export -f ns_switch_create
+
+ns_init()
+{
+	ip link add name w1 type veth peer name w2
+	ip link add name w3 type veth peer name w4
+
+	ip link set dev lo up
+
+	ns_h1_create
+	ns_h2_create
+	ns_switch_create
+}
+export -f ns_init
+
+ns1_create()
+{
+	ip netns add ns1
+	ip link set dev v2 netns ns1
+	in_ns ns1 ns_init
+}
+
+ns1_destroy()
+{
+	ip netns exec ns1 ip link set dev v2 netns 1
+	ip netns del ns1
+}
+
+macs_populate()
+{
+	local mac1=$1; shift
+	local mac2=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local dst=$1; shift
+
+	bridge fdb add $mac1 dev vx10 self master extern_learn static \
+		dst $dst vlan 10
+	bridge fdb add $mac2 dev vx20 self master extern_learn static \
+		dst $dst vlan 20
+
+	ip neigh add $ip1 lladdr $mac1 nud noarp dev vlan10 \
+		extern_learn
+	ip neigh add $ip2 lladdr $mac2 nud noarp dev vlan20 \
+		extern_learn
+}
+export -f macs_populate
+
+macs_initialize()
+{
+	local h1_ns_mac=$(in_ns ns1 mac_get w2)
+	local h2_ns_mac=$(in_ns ns1 mac_get w4)
+	local h1_mac=$(mac_get $h1)
+	local h2_mac=$(mac_get $h2)
+
+	macs_populate $h1_ns_mac $h2_ns_mac 2001:db8:1::4 2001:db8:2::4 \
+		2001:db8:3::2
+	in_ns ns1 macs_populate $h1_mac $h2_mac 2001:db8:1::1 2001:db8:2::1 \
+		2001:db8:3::1
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp1=${NETIFS[p5]}
+	rp2=${NETIFS[p6]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+	switch_create
+
+	ip link add name v1 type veth peer name v2
+	spine_create
+	ns1_create
+	in_ns ns1 forwarding_enable
+
+	macs_initialize
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ns1_destroy
+	spine_destroy
+	ip link del dev v1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:2::1 ": local->local vid 10->vid 20"
+	ping6_test $h1 2001:db8:1::4 ": local->remote vid 10->vid 10"
+	ping6_test $h2 2001:db8:2::4 ": local->remote vid 20->vid 20"
+	ping6_test $h1 2001:db8:2::4 ": local->remote vid 10->vid 20"
+	ping6_test $h2 2001:db8:1::4 ": local->remote vid 20->vid 10"
+}
+
+arp_decap()
+{
+	# Repeat the ping tests, but without populating the neighbours. This
+	# makes sure we correctly decapsulate ARP packets
+	log_info "deleting neighbours from vlan interfaces"
+
+	ip neigh del 2001:db8:1::4 dev vlan10
+	ip neigh del 2001:db8:2::4 dev vlan20
+
+	ping_ipv6
+
+	ip neigh replace 2001:db8:1::4 lladdr $(in_ns ns1 mac_get w2) \
+		nud noarp dev vlan10 extern_learn
+	ip neigh replace 2001:db8:2::4 lladdr $(in_ns ns1 mac_get w4) \
+		nud noarp dev vlan20 extern_learn
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From 6c6ea78a116164f6672e05554ec32bed1b195432 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:48 +0200
Subject: selftests: forwarding: Add a test for VxLAN symmetric routing with
 IPv6

In a similar fashion to the asymmetric test, add a test for symmetric
routing. In symmetric routing both the ingress and egress VTEPs perform
routing in the overlay network into / from the VxLAN tunnel. Packets in
different directions use the same VNI - the L3 VNI.
Different tenants (VRFs) use different L3 VNIs.

Add a test which is similar to the existing IPv4 test to check IPv6.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/forwarding/vxlan_symmetric_ipv6.sh         | 563 +++++++++++++++++++++
 1 file changed, 563 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/vxlan_symmetric_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_symmetric_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_symmetric_ipv6.sh
new file mode 100755
index 000000000000..904633427fd0
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_symmetric_ipv6.sh
@@ -0,0 +1,563 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+
+# +--------------------------------+            +-----------------------------+
+# |                         vrf-h1 |            |                      vrf-h2 |
+# |    + $h1                       |            | + $h2                       |
+# |    | 2001:db8:1::1/64          |            | | 2001:db8:2::1/64          |
+# |    | default via 2001:db8:1::3 |            | | default via 2001:db8:2::3 |
+# +----|---------------------------+            +-|---------------------------+
+#      |                                          |
+# +----|------------------------------------------|---------------------------+
+# | SW |                                          |                           |
+# | +--|------------------------------------------|-------------------------+ |
+# | |  + $swp1                         br1          + $swp2                 | |
+# | |     vid 10 pvid untagged                         vid 20 pvid untagged | |
+# | |                                                                       | |
+# | |  + vx10                                       + vx20                  | |
+# | |    local 2001:db8:3::1                          local 2001:db8:3::1   | |
+# | |    remote 2001:db8:3::2                         remote 2001:db8:3::2  | |
+# | |    id 1010                                      id 1020               | |
+# | |    dstport 4789                                 dstport 4789          | |
+# | |    vid 10 pvid untagged                         vid 20 pvid untagged  | |
+# | |                                                                       | |
+# | |                             + vx4001                                  | |
+# | |                               local 2001:db8:3::1                     | |
+# | |                               remote 2001:db8:3::2                    | |
+# | |                               id 104001                               | |
+# | |                               dstport 4789                            | |
+# | |                               vid 4001 pvid untagged                  | |
+# | |                                                                       | |
+# | +-----------------------------------+-----------------------------------+ |
+# |                                     |                                     |
+# | +-----------------------------------|-----------------------------------+ |
+# | |                                   |                                   | |
+# | |  +--------------------------------+--------------------------------+  | |
+# | |  |                                |                                |  | |
+# | |  + vlan10                         |                         vlan20 +  | |
+# | |  | 2001:db8:1::2/64               |               2001:db8:2::2/64 |  | |
+# | |  |                                |                                |  | |
+# | |  + vlan10-v (macvlan)             +             vlan20-v (macvlan) +  | |
+# | |    2001:db8:1::3/64           vlan4001            2001:db8:2::3/64    | |
+# | |    00:00:5e:00:01:01                             00:00:5e:00:01:01    | |
+# | |                               vrf-green                               | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |    + $rp1                                       +lo                       |
+# |    | 2001:db8:4::1/64                           2001:db8:3::1             |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|--------------------------------------------------------+
+# |    |                            vrf-spine                   |
+# |    + $rp2                                                   |
+# |      2001:db8:4::2/64                                       |
+# |                                                             |   (maybe) HW
+# =============================================================================
+# |                                                             |  (likely) SW
+# |                                                             |
+# |    + v1 (veth)                                              |
+# |    | 2001:db8:5::2/64                                       |
+# +----|--------------------------------------------------------+
+#      |
+# +----|----------------------------------------------------------------------+
+# |    + v2 (veth)                                  +lo           NS1 (netns) |
+# |      2001:db8:5::1/64                            2001:db8:3::2/128        |
+# |                                                                           |
+# | +-----------------------------------------------------------------------+ |
+# | |                               vrf-green                               | |
+# | |  + vlan10-v (macvlan)                           vlan20-v (macvlan) +  | |
+# | |  | 2001:db8:1::3/64                               2001:db8:2::3/64 |  | |
+# | |  | 00:00:5e:00:01:01                             00:00:5e:00:01:01 |  | |
+# | |  |                            vlan4001                             |  | |
+# | |  + vlan10                         +                         vlan20 +  | |
+# | |  | 2001:db8:1::3/64               |               2001:db8:2::3/64 |  | |
+# | |  |                                |                                |  | |
+# | |  +--------------------------------+--------------------------------+  | |
+# | |                                   |                                   | |
+# | +-----------------------------------|-----------------------------------+ |
+# |                                     |                                     |
+# | +-----------------------------------+-----------------------------------+ |
+# | |                                                                       | |
+# | |  + vx10                                     + vx20                    | |
+# | |    local 2001:db8:3::2                        local 2001:db8:3::2     | |
+# | |    remote 2001:db8:3::1                       remote 2001:db8:3::1    | |
+# | |    id 1010                                    id 1020                 | |
+# | |    dstport 4789                               dstport 4789            | |
+# | |    vid 10 pvid untagged                       vid 20 pvid untagged    | |
+# | |                                                                       | |
+# | |                             + vx4001                                  | |
+# | |                               local 2001:db8:3::2                     | |
+# | |                               remote 2001:db8:3::1                    | |
+# | |                               id 104001                               | |
+# | |                               dstport 4789                            | |
+# | |                               vid 4001 pvid untagged                  | |
+# | |                                                                       | |
+# | |  + w1 (veth)                                + w3 (veth)               | |
+# | |  | vid 10 pvid untagged          br1        | vid 20 pvid untagged    | |
+# | +--|------------------------------------------|-------------------------+ |
+# |    |                                          |                           |
+# |    |                                          |                           |
+# | +--|----------------------+                +--|-------------------------+ |
+# | |  |               vrf-h1 |                |  |                  vrf-h2 | |
+# | |  + w2 (veth)            |                |  + w4 (veth)               | |
+# | |    2001:db8:1::4/64     |                |    2001:db8:2::4/64        | |
+# | |    default via          |                |    default via             | |
+# | |    2001:db8:1::3/64     |                |    2001:db8:2::3/64        | |
+# | +-------------------------+                +----------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+	ping_ipv6
+"
+NUM_NETIFS=6
+source lib.sh
+
+hx_create()
+{
+	local vrf_name=$1; shift
+	local if_name=$1; shift
+	local ip_addr=$1; shift
+	local gw_ip=$1; shift
+
+	vrf_create $vrf_name
+	ip link set dev $if_name master $vrf_name
+	ip link set dev $vrf_name up
+	ip link set dev $if_name up
+
+	ip address add $ip_addr/64 dev $if_name
+	ip neigh replace $gw_ip lladdr 00:00:5e:00:01:01 nud permanent \
+		dev $if_name
+	ip route add default vrf $vrf_name nexthop via $gw_ip
+}
+export -f hx_create
+
+hx_destroy()
+{
+	local vrf_name=$1; shift
+	local if_name=$1; shift
+	local ip_addr=$1; shift
+	local gw_ip=$1; shift
+
+	ip route del default vrf $vrf_name nexthop via $gw_ip
+	ip neigh del $gw_ip dev $if_name
+	ip address del $ip_addr/64 dev $if_name
+
+	ip link set dev $if_name down
+	vrf_destroy $vrf_name
+}
+
+h1_create()
+{
+	hx_create "vrf-h1" $h1 2001:db8:1::1 2001:db8:1::3
+}
+
+h1_destroy()
+{
+	hx_destroy "vrf-h1" $h1 2001:db8:1::1 2001:db8:1::3
+}
+
+h2_create()
+{
+	hx_create "vrf-h2" $h2 2001:db8:2::1 2001:db8:2::3
+}
+
+h2_destroy()
+{
+	hx_destroy "vrf-h2" $h2 2001:db8:2::1 2001:db8:2::3
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	ip link set dev $rp1 up
+	ip address add dev $rp1 2001:db8:4::1/64
+	ip route add 2001:db8:3::2/128 nexthop via 2001:db8:4::2
+
+	ip link add name vx10 type vxlan id 1010		\
+		local 2001:db8:3::1 remote 2001:db8:3::2 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx10 up
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 1020		\
+		local 2001:db8:3::1 remote 2001:db8:3::2 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx20 up
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	ip link add name vx4001 type vxlan id 104001		\
+		local 2001:db8:3::1 dstport 4789			\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx4001 up
+
+	ip link set dev vx4001 master br1
+	bridge vlan add vid 4001 dev vx4001 pvid untagged
+
+	ip address add 2001:db8:3::1/128 dev lo
+
+	# Create SVIs
+	vrf_create "vrf-green"
+	ip link set dev vrf-green up
+
+	ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+	ip address add 2001:db8:1::2/64 dev vlan10
+	ip link add link vlan10 name vlan10-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:1::3/64 dev vlan10-v
+
+	ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+	ip address add 2001:db8:2::2/64 dev vlan20
+	ip link add link vlan20 name vlan20-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:2::3/64 dev vlan20-v
+
+	ip link add link br1 name vlan4001 up master vrf-green \
+		type vlan id 4001
+
+	bridge vlan add vid 10 dev br1 self
+	bridge vlan add vid 20 dev br1 self
+	bridge vlan add vid 4001 dev br1 self
+
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+	bridge vlan add vid 10 dev $swp1 pvid untagged
+	bridge vlan add vid 20 dev $swp2 pvid untagged
+}
+
+switch_destroy()
+{
+	bridge vlan del vid 20 dev br1 self
+	bridge vlan del vid 10 dev br1 self
+
+	bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20
+	bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10
+
+	bridge vlan del vid 4001 dev br1 self
+	ip link del dev vlan4001
+
+	ip link del dev vlan20
+
+	ip link del dev vlan10
+
+	vrf_destroy "vrf-green"
+
+	ip address del 2001:db8:3::1/128 dev lo
+
+	bridge vlan del vid 20 dev $swp2
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	bridge vlan del vid 10 dev $swp1
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	bridge vlan del vid 4001 dev vx4001
+	ip link set dev vx4001 nomaster
+
+	ip link set dev vx4001 down
+	ip link del dev vx4001
+
+	bridge vlan del vid 20 dev vx20
+	ip link set dev vx20 nomaster
+
+	ip link set dev vx20 down
+	ip link del dev vx20
+
+	bridge vlan del vid 10 dev vx10
+	ip link set dev vx10 nomaster
+
+	ip link set dev vx10 down
+	ip link del dev vx10
+
+	ip route del 2001:db8:3::2 nexthop via 2001:db8:4::2
+	ip address del dev $rp1 2001:db8:4::1/64
+	ip link set dev $rp1 down
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+spine_create()
+{
+	vrf_create "vrf-spine"
+	ip link set dev $rp2 master vrf-spine
+	ip link set dev v1 master vrf-spine
+	ip link set dev vrf-spine up
+	ip link set dev $rp2 up
+	ip link set dev v1 up
+
+	ip address add 2001:db8:4::2/64 dev $rp2
+	ip address add 2001:db8:5::2/64 dev v1
+
+	ip route add 2001:db8:3::1/128 vrf vrf-spine nexthop via \
+		2001:db8:4::1
+	ip route add 2001:db8:3::2/128 vrf vrf-spine nexthop via \
+		2001:db8:5::1
+}
+
+spine_destroy()
+{
+	ip route del 2001:db8:3::2/128 vrf vrf-spine nexthop via \
+		2001:db8:5::1
+	ip route del 2001:db8:3::1/128 vrf vrf-spine nexthop via \
+		2001:db8:4::1
+
+	ip address del 2001:db8:5::2/64 dev v1
+	ip address del 2001:db8:4::2/64 dev $rp2
+
+	ip link set dev v1 down
+	ip link set dev $rp2 down
+	vrf_destroy "vrf-spine"
+}
+
+ns_h1_create()
+{
+	hx_create "vrf-h1" w2 2001:db8:1::4 2001:db8:1::3
+}
+export -f ns_h1_create
+
+ns_h2_create()
+{
+	hx_create "vrf-h2" w4 2001:db8:2::4 2001:db8:2::3
+}
+export -f ns_h2_create
+
+ns_switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 0
+	ip link set dev br1 up
+
+	ip link set dev v2 up
+	ip address add dev v2 2001:db8:5::1/64
+	ip route add 2001:db8:3::1 nexthop via 2001:db8:5::2
+
+	ip link add name vx10 type vxlan id 1010		\
+		local 2001:db8:3::2 remote 2001:db8:3::1 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx10 up
+
+	ip link set dev vx10 master br1
+	bridge vlan add vid 10 dev vx10 pvid untagged
+
+	ip link add name vx20 type vxlan id 1020		\
+		local 2001:db8:3::2 remote 2001:db8:3::1 dstport 4789	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx20 up
+
+	ip link set dev vx20 master br1
+	bridge vlan add vid 20 dev vx20 pvid untagged
+
+	ip link add name vx4001 type vxlan id 104001		\
+		local 2001:db8:3::2 dstport 4789			\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx4001 up
+
+	ip link set dev vx4001 master br1
+	bridge vlan add vid 4001 dev vx4001 pvid untagged
+
+	ip link set dev w1 master br1
+	ip link set dev w1 up
+	bridge vlan add vid 10 dev w1 pvid untagged
+
+	ip link set dev w3 master br1
+	ip link set dev w3 up
+	bridge vlan add vid 20 dev w3 pvid untagged
+
+	ip address add 2001:db8:3::2/128 dev lo
+
+	# Create SVIs
+	vrf_create "vrf-green"
+	ip link set dev vrf-green up
+
+	ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+	ip address add 2001:db8:1::3/64 dev vlan10
+	ip link add link vlan10 name vlan10-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:1::3/64 dev vlan10-v
+
+	ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+	ip address add 2001:db8:2::3/64 dev vlan20
+	ip link add link vlan20 name vlan20-v up master vrf-green \
+		address 00:00:5e:00:01:01 type macvlan mode private
+	ip address add 2001:db8:2::3/64 dev vlan20-v
+
+	ip link add link br1 name vlan4001 up master vrf-green \
+		type vlan id 4001
+
+	bridge vlan add vid 10 dev br1 self
+	bridge vlan add vid 20 dev br1 self
+	bridge vlan add vid 4001 dev br1 self
+
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+	bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+}
+export -f ns_switch_create
+
+ns_init()
+{
+	ip link add name w1 type veth peer name w2
+	ip link add name w3 type veth peer name w4
+
+	ip link set dev lo up
+
+	ns_h1_create
+	ns_h2_create
+	ns_switch_create
+}
+export -f ns_init
+
+ns1_create()
+{
+	ip netns add ns1
+	ip link set dev v2 netns ns1
+	in_ns ns1 ns_init
+}
+
+ns1_destroy()
+{
+	ip netns exec ns1 ip link set dev v2 netns 1
+	ip netns del ns1
+}
+
+__l2_vni_init()
+{
+	local mac1=$1; shift
+	local mac2=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local dst=$1; shift
+
+	bridge fdb add $mac1 dev vx10 self master extern_learn static \
+		dst $dst vlan 10
+	bridge fdb add $mac2 dev vx20 self master extern_learn static \
+		dst $dst vlan 20
+
+	ip neigh add $ip1 lladdr $mac1 nud noarp dev vlan10 \
+		extern_learn
+	ip neigh add $ip2 lladdr $mac2 nud noarp dev vlan20 \
+		extern_learn
+}
+export -f __l2_vni_init
+
+l2_vni_init()
+{
+	local h1_ns_mac=$(in_ns ns1 mac_get w2)
+	local h2_ns_mac=$(in_ns ns1 mac_get w4)
+	local h1_mac=$(mac_get $h1)
+	local h2_mac=$(mac_get $h2)
+
+	__l2_vni_init $h1_ns_mac $h2_ns_mac 2001:db8:1::4 2001:db8:2::4 \
+	       2001:db8:3::2
+	in_ns ns1 __l2_vni_init $h1_mac $h2_mac 2001:db8:1::1 2001:db8:2::1 \
+	       2001:db8:3::1
+}
+
+__l3_vni_init()
+{
+	local mac=$1; shift
+	local vtep_ip=$1; shift
+	local host1_ip=$1; shift
+	local host2_ip=$1; shift
+
+	bridge fdb add $mac dev vx4001 self master extern_learn static \
+		dst $vtep_ip vlan 4001
+
+	ip neigh add $vtep_ip lladdr $mac nud noarp dev vlan4001 extern_learn
+
+	ip route add $host1_ip/128 vrf vrf-green nexthop via $vtep_ip \
+		dev vlan4001 onlink
+	ip route add $host2_ip/128 vrf vrf-green nexthop via $vtep_ip \
+		dev vlan4001 onlink
+}
+export -f __l3_vni_init
+
+l3_vni_init()
+{
+	local vlan4001_ns_mac=$(in_ns ns1 mac_get vlan4001)
+	local vlan4001_mac=$(mac_get vlan4001)
+
+	__l3_vni_init $vlan4001_ns_mac 2001:db8:3::2 2001:db8:1::4 \
+		2001:db8:2::4
+	in_ns ns1 __l3_vni_init $vlan4001_mac 2001:db8:3::1 2001:db8:1::1 \
+		2001:db8:2::1
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp1=${NETIFS[p5]}
+	rp2=${NETIFS[p6]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+	switch_create
+
+	ip link add name v1 type veth peer name v2
+	spine_create
+	ns1_create
+	in_ns ns1 forwarding_enable
+
+	l2_vni_init
+	l3_vni_init
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ns1_destroy
+	spine_destroy
+	ip link del dev v1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:2::1 ": local->local vid 10->vid 20"
+	ping6_test $h1 2001:db8:1::4 ": local->remote vid 10->vid 10"
+	ping6_test $h2 2001:db8:2::4 ": local->remote vid 20->vid 20"
+	ping6_test $h1 2001:db8:2::4 ": local->remote vid 10->vid 20"
+	ping6_test $h2 2001:db8:1::4 ": local->remote vid 20->vid 10"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From bf0a8b9bf2c3e36923982943b26919467c281199 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Tue, 21 Dec 2021 16:49:49 +0200
Subject: selftests: forwarding: Add Q-in-VNI test for IPv6

Add test to check Q-in-VNI traffic with IPv6 underlay and overlay.
The test is similar to the existing IPv4 test.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/q_in_vni_ipv6.sh      | 347 +++++++++++++++++++++
 1 file changed, 347 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/q_in_vni_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/q_in_vni_ipv6.sh b/tools/testing/selftests/net/forwarding/q_in_vni_ipv6.sh
new file mode 100755
index 000000000000..0548b2b0d416
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/q_in_vni_ipv6.sh
@@ -0,0 +1,347 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-----------------------+                          +------------------------+
+# | H1 (vrf)              |                          | H2 (vrf)               |
+# |  + $h1.10             |                          |  + $h2.10              |
+# |  | 2001:db8:1::1/64   |                          |  | 2001:db8:1::2/64    |
+# |  |                    |                          |  |                     |
+# |  | + $h1.20           |                          |  | + $h2.20            |
+# |  \ | 2001:db8:2::1/64 |                          |  \ | 2001:db8:2::2/64  |
+# |   \|                  |                          |   \|                   |
+# |    + $h1              |                          |    + $h2               |
+# +----|------------------+                          +----|-------------------+
+#      |                                                  |
+# +----|--------------------------------------------------|-------------------+
+# | SW |                                                  |                   |
+# | +--|--------------------------------------------------|-----------------+ |
+# | |  + $swp1                   BR1 (802.1ad)            + $swp2           | |
+# | |    vid 100 pvid untagged                              vid 100 pvid    | |
+# | |                                                           untagged    | |
+# | |  + vx100 (vxlan)                                                      | |
+# | |    local 2001:db8:3::1                                                | |
+# | |    remote 2001:db8:4::1 2001:db8:5::1                                 | |
+# | |    id 1000 dstport $VXPORT                                            | |
+# | |    vid 100 pvid untagged                                              | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |  2001:db8:4::0/64 via 2001:db8:3::2                                       |
+# |  2001:db8:5::0/64 via 2001:db8:3::2                                       |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|----------------------------------------------------------+
+# |    |                                             VRP2 (vrf)   |
+# |    + $rp2                                                     |
+# |      2001:db8:3::2/64                                         |
+# |                                                               |   (maybe) HW
+# =============================================================================
+# |                                                               | (likely) SW
+# |    + v1 (veth)                             + v3 (veth)        |
+# |    | 2001:db8:4::2/64                      | 2001:db8:5::2/64 |
+# +----|---------------------------------------|------------------+
+#      |                                       |
+# +----|--------------------------------+ +----|-------------------------------+
+# |    + v2 (veth)        NS1 (netns)   | |    + v4 (veth)        NS2 (netns)  |
+# |      2001:db8:4::1/64               | |      2001:db8:5::1/64              |
+# |                                     | |                                    |
+# | 2001:db8:3::0/64 via 2001:db8:4::2  | | 2001:db8:3::0/64 via 2001:db8:5::2 |
+# | 2001:db8:5::1/128 via 2001:db8:4::2 | | 2001:db8:4::1/128 via              |
+# |                                     | |           2001:db8:5::2            |
+# | +-------------------------------+   | | +-------------------------------+  |
+# | |                 BR2 (802.1ad) |   | | |                 BR2 (802.1ad) |  |
+# | |  + vx100 (vxlan)              |   | | |  + vx100 (vxlan)              |  |
+# | |    local 2001:db8:4::1        |   | | |    local 2001:db8:5::1        |  |
+# | |    remote 2001:db8:3::1       |   | | |    remote 2001:db8:3::1       |  |
+# | |    remote 2001:db8:5::1       |   | | |    remote 2001:db8:4::1       |  |
+# | |    id 1000 dstport $VXPORT    |   | | |    id 1000 dstport $VXPORT    |  |
+# | |    vid 100 pvid untagged      |   | | |    vid 100 pvid untagged      |  |
+# | |                               |   | | |                               |  |
+# | |  + w1 (veth)                  |   | | |  + w1 (veth)                  |  |
+# | |  | vid 100 pvid untagged      |   | | |  | vid 100 pvid untagged      |  |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# |    |                                | |    |                               |
+# | +--|----------------------------+   | | +--|----------------------------+  |
+# | |  |                  VW2 (vrf) |   | | |  |                  VW2 (vrf) |  |
+# | |  + w2 (veth)                  |   | | |  + w2 (veth)                  |  |
+# | |  |\                           |   | | |  |\                           |  |
+# | |  | + w2.10                    |   | | |  | + w2.10                    |  |
+# | |  |   2001:db8:1::3/64         |   | | |  |   2001:db8:1::4/64         |  |
+# | |  |                            |   | | |  |                            |  |
+# | |  + w2.20                      |   | | |  + w2.20                      |  |
+# | |    2001:db8:2::3/64           |   | | |    2001:db8:2::4/64           |  |
+# | +-------------------------------+   | | +-------------------------------+  |
+# +-------------------------------------+ +------------------------------------+
+
+: ${VXPORT:=4789}
+export VXPORT
+
+: ${ALL_TESTS:="
+	ping_ipv6
+    "}
+
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+	simple_if_init $h1
+	tc qdisc add dev $h1 clsact
+	vlan_create $h1 10 v$h1 2001:db8:1::1/64
+	vlan_create $h1 20 v$h1 2001:db8:2::1/64
+}
+
+h1_destroy()
+{
+	vlan_destroy $h1 20
+	vlan_destroy $h1 10
+	tc qdisc del dev $h1 clsact
+	simple_if_fini $h1
+}
+
+h2_create()
+{
+	simple_if_init $h2
+	tc qdisc add dev $h2 clsact
+	vlan_create $h2 10 v$h2 2001:db8:1::2/64
+	vlan_create $h2 20 v$h2 2001:db8:2::2/64
+}
+
+h2_destroy()
+{
+	vlan_destroy $h2 20
+	vlan_destroy $h2 10
+	tc qdisc del dev $h2 clsact
+	simple_if_fini $h2
+}
+
+rp1_set_addr()
+{
+	ip address add dev $rp1 2001:db8:3::1/64
+
+	ip route add 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+	ip route add 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+}
+
+rp1_unset_addr()
+{
+	ip route del 2001:db8:5::0/64 nexthop via 2001:db8:3::2
+	ip route del 2001:db8:4::0/64 nexthop via 2001:db8:3::2
+
+	ip address del dev $rp1 2001:db8:3::1/64
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 1 vlan_protocol 802.1ad \
+		vlan_default_pvid 0 mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	ip link set dev $rp1 up
+	rp1_set_addr
+
+	ip link add name vx100 type vxlan id 1000	\
+		local 2001:db8:3::1 dstport "$VXPORT"	\
+		nolearning udp6zerocsumrx udp6zerocsumtx tos inherit ttl 100
+	ip link set dev vx100 up
+
+	ip link set dev vx100 master br1
+	bridge vlan add vid 100 dev vx100 pvid untagged
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+	bridge vlan add vid 100 dev $swp1 pvid untagged
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+	bridge vlan add vid 100 dev $swp2 pvid untagged
+
+	bridge fdb append dev vx100 00:00:00:00:00:00 dst 2001:db8:4::1 self
+	bridge fdb append dev vx100 00:00:00:00:00:00 dst 2001:db8:5::1 self
+}
+
+switch_destroy()
+{
+	bridge fdb del dev vx100 00:00:00:00:00:00 dst 2001:db8:5::1 self
+	bridge fdb del dev vx100 00:00:00:00:00:00 dst 2001:db8:4::1 self
+
+	bridge vlan del vid 100 dev $swp2
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	bridge vlan del vid 100 dev $swp1
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	ip link set dev vx100 nomaster
+	ip link set dev vx100 down
+	ip link del dev vx100
+
+	rp1_unset_addr
+	ip link set dev $rp1 down
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+vrp2_create()
+{
+	simple_if_init $rp2 2001:db8:3::2/64
+	__simple_if_init v1 v$rp2 2001:db8:4::2/64
+	__simple_if_init v3 v$rp2 2001:db8:5::2/64
+	tc qdisc add dev v1 clsact
+}
+
+vrp2_destroy()
+{
+	tc qdisc del dev v1 clsact
+	__simple_if_fini v3 2001:db8:5::2/64
+	__simple_if_fini v1 2001:db8:4::2/64
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+ns_init_common()
+{
+	local in_if=$1; shift
+	local in_addr=$1; shift
+	local other_in_addr=$1; shift
+	local nh_addr=$1; shift
+	local host_addr1=$1; shift
+	local host_addr2=$1; shift
+
+	ip link set dev $in_if up
+	ip address add dev $in_if $in_addr/64
+	tc qdisc add dev $in_if clsact
+
+	ip link add name br2 type bridge vlan_filtering 1 vlan_protocol 802.1ad \
+		vlan_default_pvid 0
+	ip link set dev br2 up
+
+	ip link add name w1 type veth peer name w2
+
+	ip link set dev w1 master br2
+	ip link set dev w1 up
+	bridge vlan add vid 100 dev w1 pvid untagged
+
+	ip link add name vx100 type vxlan id 1000 local $in_addr \
+		dstport "$VXPORT" udp6zerocsumrx
+	ip link set dev vx100 up
+	bridge fdb append dev vx100 00:00:00:00:00:00 dst 2001:db8:3::1 self
+	bridge fdb append dev vx100 00:00:00:00:00:00 dst $other_in_addr self
+
+	ip link set dev vx100 master br2
+	tc qdisc add dev vx100 clsact
+
+	bridge vlan add vid 100 dev vx100 pvid untagged
+
+	simple_if_init w2
+        vlan_create w2 10 vw2 $host_addr1/64
+        vlan_create w2 20 vw2 $host_addr2/64
+
+	ip route add 2001:db8:3::0/64 nexthop via $nh_addr
+	ip route add $other_in_addr/128 nexthop via $nh_addr
+}
+export -f ns_init_common
+
+ns1_create()
+{
+	ip netns add ns1
+	ip link set dev v2 netns ns1
+	in_ns ns1 \
+	      ns_init_common v2 2001:db8:4::1 2001:db8:5::1 2001:db8:4::2 \
+			    2001:db8:1::3 2001:db8:2::3
+}
+
+ns1_destroy()
+{
+	ip netns exec ns1 ip link set dev v2 netns 1
+	ip netns del ns1
+}
+
+ns2_create()
+{
+	ip netns add ns2
+	ip link set dev v4 netns ns2
+	in_ns ns2 \
+	      ns_init_common v4 2001:db8:5::1 2001:db8:4::1 2001:db8:5::2 \
+			     2001:db8:1::4 2001:db8:2::4
+}
+
+ns2_destroy()
+{
+	ip netns exec ns2 ip link set dev v4 netns 1
+	ip netns del ns2
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp1=${NETIFS[p5]}
+	rp2=${NETIFS[p6]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+	switch_create
+
+	ip link add name v1 type veth peer name v2
+	ip link add name v3 type veth peer name v4
+	vrp2_create
+	ns1_create
+	ns2_create
+
+	r1_mac=$(in_ns ns1 mac_get w2)
+	r2_mac=$(in_ns ns2 mac_get w2)
+	h2_mac=$(mac_get $h2)
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ns2_destroy
+	ns1_destroy
+	vrp2_destroy
+	ip link del dev v3
+	ip link del dev v1
+
+	switch_destroy
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:1::2 ": local->local"
+	ping6_test $h1 2001:db8:1::3 ": local->remote 1"
+	ping6_test $h1 2001:db8:1::4 ": local->remote 2"
+}
+
+test_all()
+{
+	echo "Running tests with UDP port $VXPORT"
+	tests_run
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+test_all
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From 8e059d64bee4a71064423ed34e04868cff478678 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:29:55 +0200
Subject: selftests: mlxsw: vxlan: Make the test more flexible for future use

vxlan.sh cases are dedicated to test VxLAN with IPv4 underlay.
The main changes to test IPv6 underlay are IP addresses and some flags.

Add variables to define all the values which supposed to be different
for IPv6 testing, set them to use the existing values by default.

The next patch will define the new added variables in a separated file,
so the same tests can be used for IPv6 also.

Rename some functions to include "ipv4", so the next patch will add
equivalent functions for IPv6.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/drivers/net/mlxsw/vxlan.sh | 224 ++++++++++++---------
 1 file changed, 125 insertions(+), 99 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
index 3639b89c81ba..99a332b712f0 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
@@ -4,10 +4,35 @@
 # Test various aspects of VxLAN offloading which are specific to mlxsw, such
 # as sanitization of invalid configurations and offload indication.
 
-lib_dir=$(dirname $0)/../../../net/forwarding
+: ${ADDR_FAMILY:=ipv4}
+export ADDR_FAMILY
+
+: ${LOCAL_IP_1:=198.51.100.1}
+export LOCAL_IP_1
+
+: ${LOCAL_IP_2:=198.51.100.2}
+export LOCAL_IP_2
+
+: ${PREFIX_LEN:=32}
+export PREFIX_LEN
+
+: ${UDPCSUM_FLAFS:=noudpcsum}
+export UDPCSUM_FLAFS
 
-ALL_TESTS="sanitization_test offload_indication_test \
-	sanitization_vlan_aware_test offload_indication_vlan_aware_test"
+: ${MC_IP:=239.0.0.1}
+export MC_IP
+
+: ${IP_FLAG:=""}
+export IP_FLAG
+
+: ${ALL_TESTS:="
+	sanitization_test
+	offload_indication_test
+	sanitization_vlan_aware_test
+	offload_indication_vlan_aware_test
+"}
+
+lib_dir=$(dirname $0)/../../../net/forwarding
 NUM_NETIFS=2
 : ${TIMEOUT:=20000} # ms
 source $lib_dir/lib.sh
@@ -63,8 +88,8 @@ sanitization_single_dev_valid_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_pass
 
@@ -80,8 +105,8 @@ sanitization_single_dev_vlan_aware_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0 vlan_filtering 1
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_pass
 
@@ -97,8 +122,8 @@ sanitization_single_dev_mcast_enabled_test()
 
 	ip link add dev br0 type bridge
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_fail
 
@@ -115,9 +140,9 @@ sanitization_single_dev_mcast_group_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add name dummy1 up type dummy
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789 \
-		dev dummy1 group 239.0.0.1
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789 \
+		dev dummy1 group $MC_IP
 
 	sanitization_single_dev_test_fail
 
@@ -134,7 +159,7 @@ sanitization_single_dev_no_local_ip_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
 		ttl 20 tos inherit dstport 4789
 
 	sanitization_single_dev_test_fail
@@ -145,14 +170,14 @@ sanitization_single_dev_no_local_ip_test()
 	log_test "vxlan device with no local ip"
 }
 
-sanitization_single_dev_learning_enabled_test()
+sanitization_single_dev_learning_enabled_ipv4_test()
 {
 	RET=0
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 learning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 learning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_pass
 
@@ -169,8 +194,8 @@ sanitization_single_dev_local_interface_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add name dummy1 up type dummy
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789 dev dummy1
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789 dev dummy1
 
 	sanitization_single_dev_test_fail
 
@@ -187,8 +212,8 @@ sanitization_single_dev_port_range_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789 \
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789 \
 		srcport 4000 5000
 
 	sanitization_single_dev_test_fail
@@ -205,8 +230,8 @@ sanitization_single_dev_tos_static_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos 20 local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos 20 local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_fail
 
@@ -222,8 +247,8 @@ sanitization_single_dev_ttl_inherit_test()
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl inherit tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl inherit tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_fail
 
@@ -233,14 +258,14 @@ sanitization_single_dev_ttl_inherit_test()
 	log_test "vxlan device with inherit ttl"
 }
 
-sanitization_single_dev_udp_checksum_test()
+sanitization_single_dev_udp_checksum_ipv4_test()
 {
 	RET=0
 
 	ip link add dev br0 type bridge mcast_snooping 0
 
 	ip link add name vxlan0 up type vxlan id 10 nolearning udpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_single_dev_test_fail
 
@@ -259,12 +284,12 @@ sanitization_single_dev_test()
 	sanitization_single_dev_mcast_enabled_test
 	sanitization_single_dev_mcast_group_test
 	sanitization_single_dev_no_local_ip_test
-	sanitization_single_dev_learning_enabled_test
+	sanitization_single_dev_learning_enabled_"$ADDR_FAMILY"_test
 	sanitization_single_dev_local_interface_test
 	sanitization_single_dev_port_range_test
 	sanitization_single_dev_tos_static_test
 	sanitization_single_dev_ttl_inherit_test
-	sanitization_single_dev_udp_checksum_test
+	sanitization_single_dev_udp_checksum_"$ADDR_FAMILY"_test
 }
 
 sanitization_multi_devs_test_pass()
@@ -316,10 +341,10 @@ sanitization_multi_devs_valid_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add dev br1 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
-	ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
+	ip link add name vxlan1 up type vxlan id 20 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_multi_devs_test_pass
 
@@ -338,10 +363,10 @@ sanitization_multi_devs_ttl_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add dev br1 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
-	ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
-		ttl 40 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
+	ip link add name vxlan1 up type vxlan id 20 nolearning $UDPCSUM_FLAFS \
+		ttl 40 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	sanitization_multi_devs_test_fail
 
@@ -360,10 +385,10 @@ sanitization_multi_devs_udp_dstport_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add dev br1 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
-	ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 5789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
+	ip link add name vxlan1 up type vxlan id 20 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 5789
 
 	sanitization_multi_devs_test_fail
 
@@ -382,10 +407,10 @@ sanitization_multi_devs_local_ip_test()
 	ip link add dev br0 type bridge mcast_snooping 0
 	ip link add dev br1 type bridge mcast_snooping 0
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
-	ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.2 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
+	ip link add name vxlan1 up type vxlan id 20 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_2 dstport 4789
 
 	sanitization_multi_devs_test_fail
 
@@ -425,12 +450,12 @@ offload_indication_setup_create()
 	ip link set dev $swp1 master br0
 	ip link set dev $swp2 master br1
 
-	ip address add 198.51.100.1/32 dev lo
+	ip address add $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	ip link add name vxlan0 up master br0 type vxlan id 10 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 	ip link add name vxlan1 up master br1 type vxlan id 20 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 }
 
 offload_indication_setup_destroy()
@@ -438,7 +463,7 @@ offload_indication_setup_destroy()
 	ip link del dev vxlan1
 	ip link del dev vxlan0
 
-	ip address del 198.51.100.1/32 dev lo
+	ip address del $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	ip link set dev $swp2 nomaster
 	ip link set dev $swp1 nomaster
@@ -451,7 +476,7 @@ offload_indication_fdb_flood_test()
 {
 	RET=0
 
-	bridge fdb append 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.2
+	bridge fdb append 00:00:00:00:00:00 dev vxlan0 self dst $LOCAL_IP_2
 
 	busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb 00:00:00:00:00:00 \
 		bridge fdb show brport vxlan0
@@ -467,7 +492,7 @@ offload_indication_fdb_bridge_test()
 	RET=0
 
 	bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self master static \
-		dst 198.51.100.2
+		dst $LOCAL_IP_2
 
 	busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
 		de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
@@ -518,7 +543,7 @@ offload_indication_fdb_bridge_test()
 	# marked as offloaded in both drivers
 	RET=0
 
-	bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self dst 198.51.100.2
+	bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self dst $LOCAL_IP_2
 	busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
 		de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
 	check_err $?
@@ -542,17 +567,17 @@ offload_indication_decap_route_test()
 	RET=0
 
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link set dev vxlan0 down
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link set dev vxlan1 down
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - vxlan device down"
@@ -561,26 +586,26 @@ offload_indication_decap_route_test()
 
 	ip link set dev vxlan1 up
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link set dev vxlan0 up
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - vxlan device up"
 
 	RET=0
 
-	ip address delete 198.51.100.1/32 dev lo
+	ip address delete $LOCAL_IP_1/$PREFIX_LEN dev lo
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
-	ip address add 198.51.100.1/32 dev lo
+	ip address add $LOCAL_IP_1/$PREFIX_LEN dev lo
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - add local route"
@@ -589,18 +614,18 @@ offload_indication_decap_route_test()
 
 	ip link set dev $swp1 nomaster
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link set dev $swp2 nomaster
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link set dev $swp1 master br0
 	ip link set dev $swp2 master br1
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - local ports enslavement"
@@ -609,12 +634,12 @@ offload_indication_decap_route_test()
 
 	ip link del dev br0
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link del dev br1
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - bridge device deletion"
@@ -628,25 +653,25 @@ offload_indication_decap_route_test()
 	ip link set dev vxlan0 master br0
 	ip link set dev vxlan1 master br1
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link del dev vxlan0
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	ip link del dev vxlan1
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - vxlan device deletion"
 
 	ip link add name vxlan0 up master br0 type vxlan id 10 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 	ip link add name vxlan1 up master br1 type vxlan id 20 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 }
 
 check_fdb_offloaded()
@@ -703,10 +728,10 @@ __offload_indication_join_vxlan_first()
 	local mac=00:11:22:33:44:55
 	local zmac=00:00:00:00:00:00
 
-	bridge fdb append $zmac dev vxlan0 self dst 198.51.100.2
+	bridge fdb append $zmac dev vxlan0 self dst $LOCAL_IP_2
 
 	ip link set dev vxlan0 master br0
-	bridge fdb add dev vxlan0 $mac self master static dst 198.51.100.2
+	bridge fdb add dev vxlan0 $mac self master static dst $LOCAL_IP_2
 
 	RET=0
 	check_vxlan_fdb_not_offloaded
@@ -756,8 +781,8 @@ __offload_indication_join_vxlan_first()
 offload_indication_join_vxlan_first()
 {
 	ip link add dev br0 up type bridge mcast_snooping 0
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	__offload_indication_join_vxlan_first
 
@@ -771,7 +796,7 @@ __offload_indication_join_vxlan_last()
 
 	RET=0
 
-	bridge fdb append $zmac dev vxlan0 self dst 198.51.100.2
+	bridge fdb append $zmac dev vxlan0 self dst $LOCAL_IP_2
 
 	ip link set dev $swp1 master br0
 
@@ -791,8 +816,8 @@ __offload_indication_join_vxlan_last()
 offload_indication_join_vxlan_last()
 {
 	ip link add dev br0 up type bridge mcast_snooping 0
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	__offload_indication_join_vxlan_last
 
@@ -819,10 +844,10 @@ sanitization_vlan_aware_test()
 	ip link add dev br0 type bridge mcast_snooping 0 vlan_filtering 1
 
 	ip link add name vxlan10 up master br0 type vxlan id 10 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	ip link add name vxlan20 up master br0 type vxlan id 20 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	# Test that when each VNI is mapped to a different VLAN we can enslave
 	# a port to the bridge
@@ -866,20 +891,20 @@ sanitization_vlan_aware_test()
 
 	# Use the offload indication of the local route to ensure the VXLAN
 	# configuration was correctly rollbacked.
-	ip address add 198.51.100.1/32 dev lo
+	ip address add $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	ip link set dev vxlan10 type vxlan ttl 10
 	ip link set dev $swp1 master br0 &> /dev/null
 	check_fail $?
 
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	log_test "vlan-aware - failed enslavement to bridge due to conflict"
 
 	ip link set dev vxlan10 type vxlan ttl 20
-	ip address del 198.51.100.1/32 dev lo
+	ip address del $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	ip link del dev vxlan20
 	ip link del dev vxlan10
@@ -898,12 +923,12 @@ offload_indication_vlan_aware_setup_create()
 	bridge vlan add vid 10 dev $swp1
 	bridge vlan add vid 20 dev $swp1
 
-	ip address add 198.51.100.1/32 dev lo
+	ip address add $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	ip link add name vxlan10 up master br0 type vxlan id 10 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 	ip link add name vxlan20 up master br0 type vxlan id 20 nolearning \
-		noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+		$UDPCSUM_FLAFS ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	bridge vlan add vid 10 dev vxlan10 pvid untagged
 	bridge vlan add vid 20 dev vxlan20 pvid untagged
@@ -917,7 +942,7 @@ offload_indication_vlan_aware_setup_destroy()
 	ip link del dev vxlan20
 	ip link del dev vxlan10
 
-	ip address del 198.51.100.1/32 dev lo
+	ip address del $LOCAL_IP_1/$PREFIX_LEN dev lo
 
 	bridge vlan del vid 20 dev $swp1
 	bridge vlan del vid 10 dev $swp1
@@ -934,7 +959,7 @@ offload_indication_vlan_aware_fdb_test()
 	log_info "vxlan entry offload indication - vlan-aware"
 
 	bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self master static \
-		dst 198.51.100.2 vlan 10
+		dst $LOCAL_IP_2 vlan 10
 
 	busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
 		de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
@@ -985,7 +1010,7 @@ offload_indication_vlan_aware_fdb_test()
 	# marked as offloaded in both drivers
 	RET=0
 
-	bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self dst 198.51.100.2
+	bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self dst $LOCAL_IP_2
 	busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
 		de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
 	check_err $?
@@ -1003,7 +1028,7 @@ offload_indication_vlan_aware_decap_route_test()
 	RET=0
 
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	# Toggle PVID flag on one VxLAN device and make sure route is still
@@ -1011,7 +1036,7 @@ offload_indication_vlan_aware_decap_route_test()
 	bridge vlan add vid 10 dev vxlan10 untagged
 
 	busywait "$TIMEOUT" wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	# Toggle PVID flag on second VxLAN device and make sure route is no
@@ -1019,14 +1044,15 @@ offload_indication_vlan_aware_decap_route_test()
 	bridge vlan add vid 20 dev vxlan20 untagged
 
 	busywait "$TIMEOUT" not wait_for_offload \
-		ip route show table local 198.51.100.1
+		ip $IP_FLAG route show table local $LOCAL_IP_1
 	check_err $?
 
 	# Toggle PVID flag back and make sure route is marked as offloaded
 	bridge vlan add vid 10 dev vxlan10 pvid untagged
 	bridge vlan add vid 20 dev vxlan20 pvid untagged
 
-	busywait "$TIMEOUT" wait_for_offload ip route show table local 198.51.100.1
+	busywait "$TIMEOUT" wait_for_offload ip $IP_FLAG route show table local \
+		$LOCAL_IP_1
 	check_err $?
 
 	log_test "vxlan decap route - vni map/unmap"
@@ -1036,8 +1062,8 @@ offload_indication_vlan_aware_join_vxlan_first()
 {
 	ip link add dev br0 up type bridge mcast_snooping 0 \
 		vlan_filtering 1 vlan_default_pvid 1
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	__offload_indication_join_vxlan_first 1
 
@@ -1049,8 +1075,8 @@ offload_indication_vlan_aware_join_vxlan_last()
 {
 	ip link add dev br0 up type bridge mcast_snooping 0 \
 		vlan_filtering 1 vlan_default_pvid 1
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	__offload_indication_join_vxlan_last
 
@@ -1067,14 +1093,14 @@ offload_indication_vlan_aware_l3vni_test()
 	sysctl_set net.ipv6.conf.default.disable_ipv6 1
 	ip link add dev br0 up type bridge mcast_snooping 0 \
 		vlan_filtering 1 vlan_default_pvid 0
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
 
 	ip link set dev $swp1 master br0
 
 	# The test will use the offload indication on the FDB entry to
 	# understand if the tunnel is offloaded or not
-	bridge fdb append $zmac dev vxlan0 self dst 192.0.2.1
+	bridge fdb append $zmac dev vxlan0 self dst $LOCAL_IP_2
 
 	ip link set dev vxlan0 master br0
 	bridge vlan add dev vxlan0 vid 10 pvid untagged
-- 
cgit v1.2.3


From 21d4282dc1b81249184525da1acdb308288a952c Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:29:56 +0200
Subject: selftests: mlxsw: Add VxLAN configuration test for IPv6

Add test to verify configuration of VxLAN with IPv6 underlay.
Use the existing test which checks IPv4.

Add separated test cases for learning which is not supported for IPv6
and for UDP checksum flags which are different from IPv4 flags.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/drivers/net/mlxsw/vxlan_ipv6.sh      | 65 ++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/vxlan_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_ipv6.sh
new file mode 100755
index 000000000000..f2ea0163ddea
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_ipv6.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN test for IPv6.
+
+ADDR_FAMILY=ipv6
+LOCAL_IP_1=2001:db8:1::1
+LOCAL_IP_2=2001:db8:1::2
+PREFIX_LEN=128
+UDPCSUM_FLAFS="udp6zerocsumrx udp6zerocsumtx"
+MC_IP=FF02::2
+IP_FLAG="-6"
+
+ALL_TESTS="
+	sanitization_test
+	offload_indication_test
+	sanitization_vlan_aware_test
+	offload_indication_vlan_aware_test
+"
+
+sanitization_single_dev_learning_enabled_ipv6_test()
+{
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+
+	ip link add name vxlan0 up type vxlan id 10 learning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP_1 dstport 4789
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with learning enabled"
+}
+
+sanitization_single_dev_udp_checksum_ipv6_test()
+{
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+
+	ip link add name vxlan0 up type vxlan id 10 nolearning \
+		noudp6zerocsumrx udp6zerocsumtx ttl 20 tos inherit \
+		local $LOCAL_IP_1 dstport 4789
+
+	sanitization_single_dev_test_fail
+	log_test "vxlan device without zero udp checksum at RX"
+
+	ip link del dev vxlan0
+
+	ip link add name vxlan0 up type vxlan id 10 nolearning \
+		udp6zerocsumrx noudp6zerocsumtx ttl 20 tos inherit \
+		local $LOCAL_IP_1 dstport 4789
+
+	sanitization_single_dev_test_fail
+	log_test "vxlan device without zero udp checksum at TX"
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+}
+
+source vxlan.sh
-- 
cgit v1.2.3


From 696285305b32f4fc1ddfb2d158ef4bbb6085ab92 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:29:57 +0200
Subject: selftests: mlxsw: vxlan_fdb_veto: Make the test more flexible for
 future use

vxlan_fdb_veto.sh cases are dedicated to test VxLAN with IPv4 underlay.
The main changes to test IPv6 underlay are IP addresses and some flags.

Add variables to define all the values which supposed to be different
for IPv6 testing, set them to use the existing values by default.

The next patch will define the new added variables in a separated file,
so the same tests can be used for IPv6 also.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh  | 39 +++++++++++++++-------
 1 file changed, 27 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh
index 749ba3cfda1d..38148f51877a 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh
@@ -4,6 +4,21 @@
 # Test vetoing of FDB entries that mlxsw can not offload. This exercises several
 # different veto vectors to test various rollback scenarios in the vxlan driver.
 
+: ${LOCAL_IP:=198.51.100.1}
+export LOCAL_IP
+
+: ${REMOTE_IP_1:=198.51.100.2}
+export REMOTE_IP_1
+
+: ${REMOTE_IP_2:=198.51.100.3}
+export REMOTE_IP_2
+
+: ${UDPCSUM_FLAFS:=noudpcsum}
+export UDPCSUM_FLAFS
+
+: ${MC_IP:=224.0.0.1}
+export MC_IP
+
 lib_dir=$(dirname $0)/../../../net/forwarding
 
 ALL_TESTS="
@@ -26,8 +41,8 @@ setup_prepare()
 	ip link set dev $swp1 master br0
 	ip link set dev $swp2 up
 
-	ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
-		ttl 20 tos inherit local 198.51.100.1 dstport 4789
+	ip link add name vxlan0 up type vxlan id 10 nolearning $UDPCSUM_FLAFS \
+		ttl 20 tos inherit local $LOCAL_IP dstport 4789
 	ip link set dev vxlan0 master br0
 }
 
@@ -50,11 +65,11 @@ fdb_create_veto_test()
 	RET=0
 
 	bridge fdb add 01:02:03:04:05:06 dev vxlan0 self static \
-	       dst 198.51.100.2 2>/dev/null
+	       dst $REMOTE_IP_1 2>/dev/null
 	check_fail $? "multicast MAC not rejected"
 
 	bridge fdb add 01:02:03:04:05:06 dev vxlan0 self static \
-	       dst 198.51.100.2 2>&1 >/dev/null | grep -q mlxsw_spectrum
+	       dst $REMOTE_IP_1 2>&1 >/dev/null | grep -q mlxsw_spectrum
 	check_err $? "multicast MAC rejected without extack"
 
 	log_test "vxlan FDB veto - create"
@@ -65,15 +80,15 @@ fdb_replace_veto_test()
 	RET=0
 
 	bridge fdb add 00:01:02:03:04:05 dev vxlan0 self static \
-	       dst 198.51.100.2
+	       dst $REMOTE_IP_1
 	check_err $? "valid FDB rejected"
 
 	bridge fdb replace 00:01:02:03:04:05 dev vxlan0 self static \
-	       dst 198.51.100.2 port 1234 2>/dev/null
+	       dst $REMOTE_IP_1 port 1234 2>/dev/null
 	check_fail $? "FDB with an explicit port not rejected"
 
 	bridge fdb replace 00:01:02:03:04:05 dev vxlan0 self static \
-	       dst 198.51.100.2 port 1234 2>&1 >/dev/null \
+	       dst $REMOTE_IP_1 port 1234 2>&1 >/dev/null \
 	    | grep -q mlxsw_spectrum
 	check_err $? "FDB with an explicit port rejected without extack"
 
@@ -85,15 +100,15 @@ fdb_append_veto_test()
 	RET=0
 
 	bridge fdb add 00:00:00:00:00:00 dev vxlan0 self static \
-	       dst 198.51.100.2
+	       dst $REMOTE_IP_1
 	check_err $? "valid FDB rejected"
 
 	bridge fdb append 00:00:00:00:00:00 dev vxlan0 self static \
-	       dst 198.51.100.3 port 1234 2>/dev/null
+	       dst $REMOTE_IP_2 port 1234 2>/dev/null
 	check_fail $? "FDB with an explicit port not rejected"
 
 	bridge fdb append 00:00:00:00:00:00 dev vxlan0 self static \
-	       dst 198.51.100.3 port 1234 2>&1 >/dev/null \
+	       dst $REMOTE_IP_2 port 1234 2>&1 >/dev/null \
 	    | grep -q mlxsw_spectrum
 	check_err $? "FDB with an explicit port rejected without extack"
 
@@ -105,11 +120,11 @@ fdb_changelink_veto_test()
 	RET=0
 
 	ip link set dev vxlan0 type vxlan \
-	   group 224.0.0.1 dev lo 2>/dev/null
+	   group $MC_IP dev lo 2>/dev/null
 	check_fail $? "FDB with a multicast IP not rejected"
 
 	ip link set dev vxlan0 type vxlan \
-	   group 224.0.0.1 dev lo 2>&1 >/dev/null \
+	   group $MC_IP dev lo 2>&1 >/dev/null \
 	    | grep -q mlxsw_spectrum
 	check_err $? "FDB with a multicast IP rejected without extack"
 
-- 
cgit v1.2.3


From 1c7b183dac8917f6329cc809ac61cdf9d3ce9f3f Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:29:58 +0200
Subject: selftests: mlxsw: Add VxLAN FDB veto test for IPv6

Add test to verify FDB vetos of VxLAN with IPv6 underlay.
Use the existing test which checks IPv4.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/drivers/net/mlxsw/vxlan_fdb_veto_ipv6.sh       | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto_ipv6.sh
new file mode 100755
index 000000000000..66c87aab86f6
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto_ipv6.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN test for IPv6.
+
+LOCAL_IP=2001:db8:1::1
+REMOTE_IP_1=2001:db8:2::1
+REMOTE_IP_2=2001:db8:3::1
+UDPCSUM_FLAFS="udp6zerocsumrx udp6zerocsumtx"
+MC_IP=FF02::2
+
+source vxlan_fdb_veto.sh
-- 
cgit v1.2.3


From 7ae23eddfa3e5c1d1892c19a50cfe52b4af300cf Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:29:59 +0200
Subject: selftests: mlxsw: spectrum: Add a test for VxLAN flooding with IPv6

The device stores flood records in a singly linked list where each
record stores up to X IP addresses of remote VTEPs.

The number of records is changed according to ASIC type and address
family.

Add a test which is similar to the existing IPv4 test to check IPv6.
The test is dedicated for Spectrum-1 switches, which support up to five
IPv6 addresses in one record.

The test verifies that packets are correctly flooded in various cases such
as deletion of a record in the middle of the list.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/mlxsw/spectrum/vxlan_flooding_ipv6.sh      | 334 +++++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum/vxlan_flooding_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/vxlan_flooding_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/vxlan_flooding_ipv6.sh
new file mode 100755
index 000000000000..d8fd875ad527
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/vxlan_flooding_ipv6.sh
@@ -0,0 +1,334 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test VxLAN flooding. The device stores flood records in a singly linked list
+# where each record stores up to five IPv6 addresses of remote VTEPs. The test
+# verifies that packets are correctly flooded in various cases such as deletion
+# of a record in the middle of the list.
+#
+# +-----------------------+
+# | H1 (vrf)              |
+# |    + $h1              |
+# |    | 2001:db8:1::1/64 |
+# +----|------------------+
+#      |
+# +----|----------------------------------------------------------------------+
+# | SW |                                                                      |
+# | +--|--------------------------------------------------------------------+ |
+# | |  + $swp1                   BR0 (802.1d)                               | |
+# | |                                                                       | |
+# | |  + vxlan0 (vxlan)                                                     | |
+# | |    local 2001:db8:2::1                                                | |
+# | |    remote 2001:db8:2::{2..21}                                         | |
+# | |    id 10 dstport 4789                                                 | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |  2001:db8:2::0/64 via 2001:db8:3::2                                       |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|--------------------------------------------------------+
+# |    |                                               R2 (vrf) |
+# |    + $rp2                                                   |
+# |      2001:db8:3::2/64                                       |
+# |                                                             |
+# +-------------------------------------------------------------+
+
+lib_dir=$(dirname $0)/../../../../net/forwarding
+
+ALL_TESTS="flooding_test"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 2001:db8:1::1/64
+}
+
+switch_create()
+{
+	# Make sure the bridge uses the MAC address of the local port and
+	# not that of the VxLAN's device
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link set dev br0 address $(mac_get $swp1)
+
+	ip link add name vxlan0 type vxlan id 10 nolearning \
+		udp6zerocsumrx udp6zerocsumtx ttl 20 tos inherit \
+		local 2001:db8:2::1 dstport 4789
+
+	ip address add 2001:db8:2::1/128 dev lo
+
+	ip link set dev $swp1 master br0
+	ip link set dev vxlan0 master br0
+
+	ip link set dev br0 up
+	ip link set dev $swp1 up
+	ip link set dev vxlan0 up
+}
+
+switch_destroy()
+{
+	ip link set dev vxlan0 down
+	ip link set dev $swp1 down
+	ip link set dev br0 down
+
+	ip link set dev vxlan0 nomaster
+	ip link set dev $swp1 nomaster
+
+	ip address del 2001:db8:2::1/128 dev lo
+
+	ip link del dev vxlan0
+
+	ip link del dev br0
+}
+
+router1_create()
+{
+	# This router is in the default VRF, where the VxLAN device is
+	# performing the L3 lookup
+	ip link set dev $rp1 up
+	ip address add 2001:db8:3::1/64 dev $rp1
+	ip route add 2001:db8:2::0/64 via 2001:db8:3::2
+}
+
+router1_destroy()
+{
+	ip route del 2001:db8:2::0/64 via 2001:db8:3::2
+	ip address del 2001:db8:3::1/64 dev $rp1
+	ip link set dev $rp1 down
+}
+
+router2_create()
+{
+	# This router is not in the default VRF, so use simple_if_init()
+	simple_if_init $rp2 2001:db8:3::2/64
+}
+
+router2_destroy()
+{
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	rp1=${NETIFS[p3]}
+	rp2=${NETIFS[p4]}
+
+	vrf_prepare
+
+	h1_create
+
+	switch_create
+
+	router1_create
+	router2_create
+
+	forwarding_enable
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	forwarding_restore
+
+	router2_destroy
+	router1_destroy
+
+	switch_destroy
+
+	h1_destroy
+
+	vrf_cleanup
+}
+
+flooding_remotes_add()
+{
+	local num_remotes=$1
+	local lsb
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		lsb=$((i + 1))
+
+		bridge fdb append 00:00:00:00:00:00 dev vxlan0 self \
+			dst 2001:db8:2::$lsb
+	done
+}
+
+flooding_filters_add()
+{
+	local num_remotes=$1
+	local lsb
+	local i
+
+	tc qdisc add dev $rp2 clsact
+
+	for i in $(eval echo {1..$num_remotes}); do
+		lsb=$((i + 1))
+
+		tc filter add dev $rp2 ingress protocol ipv6 pref $i handle $i \
+			flower ip_proto udp dst_ip 2001:db8:2::$lsb \
+			dst_port 4789 skip_sw action drop
+	done
+}
+
+flooding_filters_del()
+{
+	local num_remotes=$1
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		tc filter del dev $rp2 ingress protocol ipv6 pref $i \
+			handle $i flower
+	done
+
+	tc qdisc del dev $rp2 clsact
+}
+
+flooding_check_packets()
+{
+	local packets=("$@")
+	local num_remotes=${#packets[@]}
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		tc_check_packets "dev $rp2 ingress" $i ${packets[i - 1]}
+		check_err $? "remote $i - did not get expected number of packets"
+	done
+}
+
+flooding_test()
+{
+	# Use 20 remote VTEPs that will be stored in 4 records. The array
+	# 'packets' will store how many packets are expected to be received
+	# by each remote VTEP at each stage of the test
+	declare -a packets=(1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)
+	local num_remotes=20
+
+	RET=0
+
+	# Add FDB entries for remote VTEPs and corresponding tc filters on the
+	# ingress of the nexthop router. These filters will count how many
+	# packets were flooded to each remote VTEP
+	flooding_remotes_add $num_remotes
+	flooding_filters_add $num_remotes
+
+	# Send one packet and make sure it is flooded to all the remote VTEPs
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 1 packet"
+
+	# Delete the third record which corresponds to VTEPs with LSB 12..16
+	# and check that packet is flooded correctly when we remove a record
+	# from the middle of the list
+	RET=0
+
+	packets=(2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::12
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::13
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::14
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::15
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::16
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 2 packets"
+
+	# Delete the first record and make sure the packet is flooded correctly
+	RET=0
+
+	packets=(2 2 2 2 2 3 3 3 3 3 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::2
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::3
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::4
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::5
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::6
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 3 packets"
+
+	# Delete the last record and make sure the packet is flooded correctly
+	RET=0
+
+	packets=(2 2 2 2 2 4 4 4 4 4 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::17
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::18
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::19
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::20
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::21
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 4 packets"
+
+	# Delete the last record, one entry at a time and make sure single
+	# entries are correctly removed
+	RET=0
+
+	packets=(2 2 2 2 2 4 5 5 5 5 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::7
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 5 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 2 4 5 6 6 6 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::8
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 6 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 2 4 5 6 7 7 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::9
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 7 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 2 4 5 6 7 8 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::10
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 8 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 2 4 5 6 7 8 1 1 1 1 1 3 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::11
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 9 packets"
+
+	flooding_filters_del $num_remotes
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From d01724dd2a66087ac6b3c4b45e25d475be7dd2f4 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:30:00 +0200
Subject: selftests: mlxsw: spectrum-2: Add a test for VxLAN flooding with IPv6

The device stores flood records in a singly linked list where each
record stores up to X IP addresses of remote VTEPs.

The number of records is changed according to ASIC type and address
family.

Add a test which is similar to the existing IPv4 test to check IPv6.
The test is dedicated for Spectrum-2 and above, which support up to four
IPv6 addresses in one record.

The test verifies that packets are correctly flooded in various cases such
as deletion of a record in the middle of the list.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh    | 322 +++++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh
new file mode 100755
index 000000000000..429f7ee735cf
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/vxlan_flooding_ipv6.sh
@@ -0,0 +1,322 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test VxLAN flooding. The device stores flood records in a singly linked list
+# where each record stores up to four IPv6 addresses of remote VTEPs. The test
+# verifies that packets are correctly flooded in various cases such as deletion
+# of a record in the middle of the list.
+#
+# +-----------------------+
+# | H1 (vrf)              |
+# |    + $h1              |
+# |    | 2001:db8:1::1/64 |
+# +----|------------------+
+#      |
+# +----|----------------------------------------------------------------------+
+# | SW |                                                                      |
+# | +--|--------------------------------------------------------------------+ |
+# | |  + $swp1                   BR0 (802.1d)                               | |
+# | |                                                                       | |
+# | |  + vxlan0 (vxlan)                                                     | |
+# | |    local 2001:db8:2::1                                                | |
+# | |    remote 2001:db8:2::{2..17}                                         | |
+# | |    id 10 dstport 4789                                                 | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |  2001:db8:2::0/64 via 2001:db8:3::2                                       |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|--------------------------------------------------------+
+# |    |                                               R2 (vrf) |
+# |    + $rp2                                                   |
+# |      2001:db8:3::2/64                                       |
+# |                                                             |
+# +-------------------------------------------------------------+
+
+lib_dir=$(dirname $0)/../../../../net/forwarding
+
+ALL_TESTS="flooding_test"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 2001:db8:1::1/64
+}
+
+switch_create()
+{
+	# Make sure the bridge uses the MAC address of the local port and
+	# not that of the VxLAN's device
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link set dev br0 address $(mac_get $swp1)
+
+	ip link add name vxlan0 type vxlan id 10 nolearning \
+		udp6zerocsumrx udp6zerocsumtx ttl 20 tos inherit \
+		local 2001:db8:2::1 dstport 4789
+
+	ip address add 2001:db8:2::1/128 dev lo
+
+	ip link set dev $swp1 master br0
+	ip link set dev vxlan0 master br0
+
+	ip link set dev br0 up
+	ip link set dev $swp1 up
+	ip link set dev vxlan0 up
+}
+
+switch_destroy()
+{
+	ip link set dev vxlan0 down
+	ip link set dev $swp1 down
+	ip link set dev br0 down
+
+	ip link set dev vxlan0 nomaster
+	ip link set dev $swp1 nomaster
+
+	ip address del 2001:db8:2::1/128 dev lo
+
+	ip link del dev vxlan0
+
+	ip link del dev br0
+}
+
+router1_create()
+{
+	# This router is in the default VRF, where the VxLAN device is
+	# performing the L3 lookup
+	ip link set dev $rp1 up
+	ip address add 2001:db8:3::1/64 dev $rp1
+	ip route add 2001:db8:2::0/64 via 2001:db8:3::2
+}
+
+router1_destroy()
+{
+	ip route del 2001:db8:2::0/64 via 2001:db8:3::2
+	ip address del 2001:db8:3::1/64 dev $rp1
+	ip link set dev $rp1 down
+}
+
+router2_create()
+{
+	# This router is not in the default VRF, so use simple_if_init()
+	simple_if_init $rp2 2001:db8:3::2/64
+}
+
+router2_destroy()
+{
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	rp1=${NETIFS[p3]}
+	rp2=${NETIFS[p4]}
+
+	vrf_prepare
+
+	h1_create
+
+	switch_create
+
+	router1_create
+	router2_create
+
+	forwarding_enable
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	forwarding_restore
+
+	router2_destroy
+	router1_destroy
+
+	switch_destroy
+
+	h1_destroy
+
+	vrf_cleanup
+}
+
+flooding_remotes_add()
+{
+	local num_remotes=$1
+	local lsb
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		lsb=$((i + 1))
+
+		bridge fdb append 00:00:00:00:00:00 dev vxlan0 self \
+			dst 2001:db8:2::$lsb
+	done
+}
+
+flooding_filters_add()
+{
+	local num_remotes=$1
+	local lsb
+	local i
+
+	tc qdisc add dev $rp2 clsact
+
+	for i in $(eval echo {1..$num_remotes}); do
+		lsb=$((i + 1))
+
+		tc filter add dev $rp2 ingress protocol ipv6 pref $i handle $i \
+			flower ip_proto udp dst_ip 2001:db8:2::$lsb \
+			dst_port 4789 skip_sw action drop
+	done
+}
+
+flooding_filters_del()
+{
+	local num_remotes=$1
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		tc filter del dev $rp2 ingress protocol ipv6 pref $i \
+			handle $i flower
+	done
+
+	tc qdisc del dev $rp2 clsact
+}
+
+flooding_check_packets()
+{
+	local packets=("$@")
+	local num_remotes=${#packets[@]}
+	local i
+
+	for i in $(eval echo {1..$num_remotes}); do
+		tc_check_packets "dev $rp2 ingress" $i ${packets[i - 1]}
+		check_err $? "remote $i - did not get expected number of packets"
+	done
+}
+
+flooding_test()
+{
+	# Use 16 remote VTEPs that will be stored in 4 records. The array
+	# 'packets' will store how many packets are expected to be received
+	# by each remote VTEP at each stage of the test
+	declare -a packets=(1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)
+	local num_remotes=16
+
+	RET=0
+
+	# Add FDB entries for remote VTEPs and corresponding tc filters on the
+	# ingress of the nexthop router. These filters will count how many
+	# packets were flooded to each remote VTEP
+	flooding_remotes_add $num_remotes
+	flooding_filters_add $num_remotes
+
+	# Send one packet and make sure it is flooded to all the remote VTEPs
+	$MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 1 packet"
+
+	# Delete the third record which corresponds to VTEPs with LSB 10..13
+	# and check that packet is flooded correctly when we remove a record
+	# from the middle of the list
+	RET=0
+
+	packets=(2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::10
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::11
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::12
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::13
+
+	$MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 2 packets"
+
+	# Delete the first record and make sure the packet is flooded correctly
+	RET=0
+
+	packets=(2 2 2 2 3 3 3 3 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::2
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::3
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::4
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::5
+
+	$MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 3 packets"
+
+	# Delete the last record and make sure the packet is flooded correctly
+	RET=0
+
+	packets=(2 2 2 2 4 4 4 4 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::14
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::15
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::16
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::17
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 4 packets"
+
+	# Delete the last record, one entry at a time and make sure single
+	# entries are correctly removed
+	RET=0
+
+	packets=(2 2 2 2 4 5 5 5 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::6
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 5 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 4 5 6 6 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::7
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 6 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 4 5 6 7 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::8
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 7 packets"
+
+	RET=0
+
+	packets=(2 2 2 2 4 5 6 7 1 1 1 1 3 3 3 3)
+	bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 2001:db8:2::9
+
+	$MZ -6 $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+	flooding_check_packets "${packets[@]}"
+	log_test "flood after 8 packets"
+
+	flooding_filters_del $num_remotes
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From c777d726267c0a5ca9216a5f17ed12d67bc97c99 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:30:01 +0200
Subject: selftests: mlxsw: Add test for VxLAN related traps for IPv6

The test configures VxLAN with IPv6 underlay and verifies that the
expected traps are triggered under the right conditions.

The test is similar to the existing IPv4 test.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh    | 342 +++++++++++++++++++++
 1 file changed, 342 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh
new file mode 100755
index 000000000000..f6c16cbb6cf7
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh
@@ -0,0 +1,342 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap tunnel drops and exceptions functionality over mlxsw.
+# Check all traps to make sure they are triggered under the right
+# conditions.
+
+# +------------------------+
+# | H1 (vrf)               |
+# |    + $h1               |
+# |    | 2001:db8:1::1/64  |
+# +----|-------------------+
+#      |
+# +----|----------------------------------------------------------------------+
+# | SW |                                                                      |
+# | +--|--------------------------------------------------------------------+ |
+# | |  + $swp1                   BR1 (802.1d)                               | |
+# | |                                                                       | |
+# | |  + vx1 (vxlan)                                                        | |
+# | |    local 2001:db8:3::1                                                | |
+# | |    id 1000 dstport $VXPORT                                            | |
+# | +-----------------------------------------------------------------------+ |
+# |                                                                           |
+# |    + $rp1                                                                 |
+# |    | 2001:db8:3::1/64                                                     |
+# +----|----------------------------------------------------------------------+
+#      |
+# +----|--------------------------------------------------------+
+# |    |                                             VRF2       |
+# |    + $rp2                                                   |
+# |      2001:db8:3::2/64                                       |
+# |                                                             |
+# +-------------------------------------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+	decap_error_test
+	overlay_smac_is_mc_test
+"
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+: ${VXPORT:=4789}
+export VXPORT
+
+h1_create()
+{
+	simple_if_init $h1 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 2001:db8:1::1/64
+}
+
+switch_create()
+{
+	ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0
+	# Make sure the bridge uses the MAC address of the local port and not
+	# that of the VxLAN's device.
+	ip link set dev br1 address $(mac_get $swp1)
+	ip link set dev br1 up
+
+	tc qdisc add dev $swp1 clsact
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+
+	ip link add name vx1 type vxlan id 1000 local 2001:db8:3::1 \
+		dstport "$VXPORT" nolearning udp6zerocsumrx udp6zerocsumtx \
+		tos inherit ttl 100
+	ip link set dev vx1 master br1
+	ip link set dev vx1 up
+
+	ip link set dev $rp1 up
+	ip address add dev $rp1 2001:db8:3::1/64
+}
+
+switch_destroy()
+{
+	ip address del dev $rp1 2001:db8:3::1/64
+	ip link set dev $rp1 down
+
+	ip link set dev vx1 down
+	ip link set dev vx1 nomaster
+	ip link del dev vx1
+
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+	tc qdisc del dev $swp1 clsact
+
+	ip link set dev br1 down
+	ip link del dev br1
+}
+
+vrf2_create()
+{
+	simple_if_init $rp2 2001:db8:3::2/64
+}
+
+vrf2_destroy()
+{
+	simple_if_fini $rp2 2001:db8:3::2/64
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	rp1=${NETIFS[p3]}
+	rp2=${NETIFS[p4]}
+
+	vrf_prepare
+	forwarding_enable
+	h1_create
+	switch_create
+	vrf2_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	vrf2_destroy
+	switch_destroy
+	h1_destroy
+	forwarding_restore
+	vrf_cleanup
+}
+
+ecn_payload_get()
+{
+	local dest_mac=$(mac_get $h1)
+	local saddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:03"
+	local daddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01"
+	p=$(:
+		)"08:"$(                      : VXLAN flags
+		)"00:00:00:"$(                : VXLAN reserved
+		)"00:03:e8:"$(                : VXLAN VNI : 1000
+		)"00:"$(                      : VXLAN reserved
+		)"$dest_mac:"$(               : ETH daddr
+		)"00:00:00:00:00:00:"$(       : ETH saddr
+		)"86:dd:"$(                   : ETH type
+		)"6"$(                        : IP version
+		)"0:0"$(		      : Traffic class
+		)"0:00:00:"$(                 : Flow label
+		)"00:08:"$(                   : Payload length
+		)"3a:"$(                      : Next header
+		)"04:"$(                      : Hop limit
+		)"$saddr:"$(                  : IP saddr
+		)"$daddr:"$(		      : IP daddr
+		)"80:"$(                      : ICMPv6.type
+		)"00:"$(                      : ICMPv6.code
+		)"00:"$(                      : ICMPv6.checksum
+		)
+	echo $p
+}
+
+ecn_decap_test()
+{
+	local trap_name="decap_error"
+	local desc=$1; shift
+	local ecn_desc=$1; shift
+	local outer_tos=$1; shift
+	local mz_pid
+
+	RET=0
+
+	tc filter add dev $swp1 egress protocol ipv6 pref 1 handle 101 \
+		flower src_ip 2001:db8:1::3 dst_ip 2001:db8:1::1 action pass
+
+	rp1_mac=$(mac_get $rp1)
+	payload=$(ecn_payload_get)
+
+	ip vrf exec v$rp2 $MZ -6 $rp2 -c 0 -d 1msec -b $rp1_mac \
+		-B 2001:db8:3::1 -t udp \
+		sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q &
+	mz_pid=$!
+
+	devlink_trap_exception_test $trap_name
+
+	tc_check_packets "dev $swp1 egress" 101 0
+	check_err $? "Packets were not dropped"
+
+	log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
+
+	kill $mz_pid && wait $mz_pid &> /dev/null
+	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
+}
+
+reserved_bits_payload_get()
+{
+	local dest_mac=$(mac_get $h1)
+	local saddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:03"
+	local daddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01"
+	p=$(:
+		)"08:"$(                      : VXLAN flags
+		)"01:00:00:"$(                : VXLAN reserved
+		)"00:03:e8:"$(                : VXLAN VNI : 1000
+		)"00:"$(                      : VXLAN reserved
+		)"$dest_mac:"$(               : ETH daddr
+		)"00:00:00:00:00:00:"$(       : ETH saddr
+		)"86:dd:"$(                   : ETH type
+		)"6"$(                        : IP version
+		)"0:0"$(		      : Traffic class
+		)"0:00:00:"$(                 : Flow label
+		)"00:08:"$(                   : Payload length
+		)"3a:"$(                      : Next header
+		)"04:"$(                      : Hop limit
+		)"$saddr:"$(                  : IP saddr
+		)"$daddr:"$(		      : IP daddr
+		)"80:"$(                      : ICMPv6.type
+		)"00:"$(                      : ICMPv6.code
+		)"00:"$(                      : ICMPv6.checksum
+		)
+	echo $p
+}
+
+short_payload_get()
+{
+        dest_mac=$(mac_get $h1)
+        p=$(:
+		)"08:"$(                      : VXLAN flags
+		)"00:00:00:"$(                : VXLAN reserved
+		)"00:03:e8:"$(                : VXLAN VNI : 1000
+		)"00:"$(                      : VXLAN reserved
+		)"$dest_mac:"$(               : ETH daddr
+		)"00:00:00:00:00:00:"$(       : ETH saddr
+		)
+        echo $p
+}
+
+corrupted_packet_test()
+{
+	local trap_name="decap_error"
+	local desc=$1; shift
+	local payload_get=$1; shift
+	local mz_pid
+
+	RET=0
+
+	# In case of too short packet, there is no any inner packet,
+	# so the matching will always succeed
+	tc filter add dev $swp1 egress protocol ipv6 pref 1 handle 101 \
+		flower skip_hw src_ip 2001:db8:3::1 dst_ip 2001:db8:1::1 \
+		action pass
+
+	rp1_mac=$(mac_get $rp1)
+	payload=$($payload_get)
+	ip vrf exec v$rp2 $MZ -6 $rp2 -c 0 -d 1msec -b $rp1_mac \
+		-B 2001:db8:3::1 -t udp sp=12345,dp=$VXPORT,p=$payload -q &
+	mz_pid=$!
+
+	devlink_trap_exception_test $trap_name
+
+	tc_check_packets "dev $swp1 egress" 101 0
+	check_err $? "Packets were not dropped"
+
+	log_test "$desc"
+
+	kill $mz_pid && wait $mz_pid &> /dev/null
+	tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower
+}
+
+decap_error_test()
+{
+	ecn_decap_test "Decap error" "ECT(1)" 01
+	ecn_decap_test "Decap error" "ECT(0)" 02
+	ecn_decap_test "Decap error" "CE" 03
+
+	corrupted_packet_test "Decap error: Reserved bits in use" \
+		"reserved_bits_payload_get"
+	corrupted_packet_test "Decap error: Too short inner packet" \
+		"short_payload_get"
+}
+
+mc_smac_payload_get()
+{
+	local dest_mac=$(mac_get $h1)
+	local source_mac="01:02:03:04:05:06"
+	local saddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:03"
+	local daddr="20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01"
+	p=$(:
+		)"08:"$(                      : VXLAN flags
+		)"00:00:00:"$(                : VXLAN reserved
+		)"00:03:e8:"$(                : VXLAN VNI : 1000
+		)"00:"$(                      : VXLAN reserved
+		)"$dest_mac:"$(               : ETH daddr
+		)"$source_mac:"$(	      : ETH saddr
+		)"86:dd:"$(                   : ETH type
+		)"6"$(                        : IP version
+		)"0:0"$(		      : Traffic class
+		)"0:00:00:"$(                 : Flow label
+		)"00:08:"$(                   : Payload length
+		)"3a:"$(                      : Next header
+		)"04:"$(                      : Hop limit
+		)"$saddr:"$(                  : IP saddr
+		)"$daddr:"$(		      : IP daddr
+		)"80:"$(                      : ICMPv6.type
+		)"00:"$(                      : ICMPv6.code
+		)"00:"$(                      : ICMPv6.checksum
+		)
+	echo $p
+}
+
+overlay_smac_is_mc_test()
+{
+	local trap_name="overlay_smac_is_mc"
+	local mz_pid
+
+	RET=0
+
+	# The matching will be checked on devlink_trap_drop_test()
+	# and the filter will be removed on devlink_trap_drop_cleanup()
+	tc filter add dev $swp1 egress protocol ipv6 pref 1 handle 101 \
+		flower src_mac 01:02:03:04:05:06 action pass
+
+	rp1_mac=$(mac_get $rp1)
+	payload=$(mc_smac_payload_get)
+
+	ip vrf exec v$rp2 $MZ -6 $rp2 -c 0 -d 1msec -b $rp1_mac \
+		-B 2001:db8:3::1 -t udp sp=12345,dp=$VXPORT,p=$payload -q &
+	mz_pid=$!
+
+	devlink_trap_drop_test $trap_name $swp1 101
+
+	log_test "Overlay source MAC is multicast"
+
+	devlink_trap_drop_cleanup $mz_pid $swp1 "ipv6" 1 101
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From 810ef9552dece4e9bb3ef37d008dc2cf4b317dc6 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Thu, 23 Dec 2021 09:30:02 +0200
Subject: selftests: mlxsw: devlink_trap_tunnel_vxlan: Fix 'decap_error' case

Change the case that sends packets with "too short inner packet" to
include part of ethernet header, to make the trap to be triggered due to
the correct reason.

According to ASIC arch, the trap is triggered if overlay packet length is
less than 18B, and the minimum inner packet should include source MAC and
destination MAC.

Till now the case passed because one of the reserved bits in VxLAN
header was used. This issue was found while adding an equivalent test
for IPv6.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh       | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
index 10e0f3dbc930..5f6eb965cfd1 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
@@ -217,9 +217,11 @@ short_payload_get()
         dest_mac=$(mac_get $h1)
         p=$(:
 		)"08:"$(                      : VXLAN flags
-		)"01:00:00:"$(                : VXLAN reserved
+		)"00:00:00:"$(                : VXLAN reserved
 		)"00:03:e8:"$(                : VXLAN VNI : 1000
 		)"00:"$(                      : VXLAN reserved
+		)"$dest_mac:"$(               : ETH daddr
+		)"00:00:00:00:00:00:"$(       : ETH saddr
 		)
         echo $p
 }
@@ -263,7 +265,8 @@ decap_error_test()
 
 	corrupted_packet_test "Decap error: Reserved bits in use" \
 		"reserved_bits_payload_get"
-	corrupted_packet_test "Decap error: No L2 header" "short_payload_get"
+	corrupted_packet_test "Decap error: Too short inner packet" \
+		"short_payload_get"
 }
 
 mc_smac_payload_get()
-- 
cgit v1.2.3


From 5652b807b7576f14c8b96e769470affef3287b7e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 23 Dec 2021 14:17:35 +0100
Subject: libbpf: Do not use btf_dump__new() macro in C++ mode

As reported in here [0], C++ compilers don't support
__builtin_types_compatible_p(), so at least don't screw up compilation
for them and let C++ users pick btf_dump__new vs
btf_dump__new_deprecated explicitly.

  [0] https://github.com/libbpf/libbpf/issues/283#issuecomment-986100727

Fixes: 6084f5dc928f ("libbpf: Ensure btf_dump__new() and btf_dump_opts are future-proof")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211223131736.483956-1-jolsa@kernel.org
---
 tools/lib/bpf/btf.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 742a2bf71c5e..061839f04525 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -313,12 +313,18 @@ LIBBPF_API struct btf_dump *btf_dump__new_deprecated(const struct btf *btf,
  *
  * The rest works just like in case of ___libbpf_override() usage with symbol
  * versioning.
+ *
+ * C++ compilers don't support __builtin_types_compatible_p(), so at least
+ * don't screw up compilation for them and let C++ users pick btf_dump__new
+ * vs btf_dump__new_deprecated explicitly.
  */
+#ifndef __cplusplus
 #define btf_dump__new(a1, a2, a3, a4) __builtin_choose_expr(				\
 	__builtin_types_compatible_p(typeof(a4), btf_dump_printf_fn_t) ||		\
 	__builtin_types_compatible_p(typeof(a4), void(void *, const char *, va_list)),	\
 	btf_dump__new_deprecated((void *)a1, (void *)a2, (void *)a3, (void *)a4),	\
 	btf_dump__new((void *)a1, (void *)a2, (void *)a3, (void *)a4))
+#endif
 
 LIBBPF_API void btf_dump__free(struct btf_dump *d);
 
-- 
cgit v1.2.3


From ecf45e60a62dfeb65658abac02f0bdb45b786911 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 23 Dec 2021 14:17:36 +0100
Subject: selftests/bpf: Add btf_dump__new to test_cpp

Adding btf_dump__new call to test_cpp, so we can
test C++ compilation with that.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211223131736.483956-2-jolsa@kernel.org
---
 tools/testing/selftests/bpf/test_cpp.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp
index a8d2e9a87fbf..e00201de2890 100644
--- a/tools/testing/selftests/bpf/test_cpp.cpp
+++ b/tools/testing/selftests/bpf/test_cpp.cpp
@@ -7,9 +7,15 @@
 
 /* do nothing, just make sure we can link successfully */
 
+static void dump_printf(void *ctx, const char *fmt, va_list args)
+{
+}
+
 int main(int argc, char *argv[])
 {
+	struct btf_dump_opts opts = { };
 	struct test_core_extern *skel;
+	struct btf *btf;
 
 	/* libbpf.h */
 	libbpf_set_print(NULL);
@@ -18,7 +24,8 @@ int main(int argc, char *argv[])
 	bpf_prog_get_fd_by_id(0);
 
 	/* btf.h */
-	btf__new(NULL, 0);
+	btf = btf__new(NULL, 0);
+	btf_dump__new(btf, dump_printf, nullptr, &opts);
 
 	/* BPF skeleton */
 	skel = test_core_extern__open_and_load();
-- 
cgit v1.2.3


From 5471d5226c3b39b3d2f7011c082d5715795bd65c Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Thu, 23 Dec 2021 22:24:41 +0000
Subject: selftests: Calculate udpgso segment count without header adjustment

The below referenced commit correctly updated the computation of number
of segments (gso_size) by using only the gso payload size and
removing the header lengths.

With this change the regression test started failing. Update
the tests to match this new behavior.

Both IPv4 and IPv6 tests are updated, as a separate patch in this series
will update udp_v6_send_skb to match this change in udp_send_skb.

Fixes: 158390e45612 ("udp: using datalen to cap max gso segments")
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20211223222441.2975883-2-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/udpgso.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index c66da6ffd6d8..7badaf215de2 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -156,13 +156,13 @@ struct testcase testcases_v4[] = {
 	},
 	{
 		/* send max number of min sized segments */
-		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+		.tlen = UDP_MAX_SEGMENTS,
 		.gso_len = 1,
-		.r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+		.r_num_mss = UDP_MAX_SEGMENTS,
 	},
 	{
 		/* send max number + 1 of min sized segments: fail */
-		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4 + 1,
+		.tlen = UDP_MAX_SEGMENTS + 1,
 		.gso_len = 1,
 		.tfail = true,
 	},
@@ -259,13 +259,13 @@ struct testcase testcases_v6[] = {
 	},
 	{
 		/* send max number of min sized segments */
-		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+		.tlen = UDP_MAX_SEGMENTS,
 		.gso_len = 1,
-		.r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+		.r_num_mss = UDP_MAX_SEGMENTS,
 	},
 	{
 		/* send max number + 1 of min sized segments: fail */
-		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6 + 1,
+		.tlen = UDP_MAX_SEGMENTS + 1,
 		.gso_len = 1,
 		.tfail = true,
 	},
-- 
cgit v1.2.3


From 9734213ed413da5ac791a984c8cecf1612fe4888 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 15 Dec 2021 17:28:02 -0800
Subject: tools/power/x86/intel-speed-select: Update max frequency

When BIOS disables turbo, the cpuinfo_max_freq will also be same as the
power up base frequency. When SST-PP causes increase in base frequency
the performance will be still limited to the old base frequency as the
cpuinfo_max_freq will not be updated.

In this case we need to update scaling_max frequency to the new
base_frequency. This will result in setting updated max performance
limit in the Pstate driver. So performance will not be limited to the
old base frequency.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index bf9fd3549a1d..7967a04559e1 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -1599,6 +1599,7 @@ static void set_scaling_min_to_cpuinfo_max(int cpu)
 		    die_id != get_physical_die_id(i))
 			continue;
 
+		adjust_scaling_max_from_base_freq(i);
 		set_cpufreq_scaling_min_max_from_cpuinfo(i, 1, 0);
 		adjust_scaling_min_from_base_freq(i);
 	}
@@ -1615,6 +1616,7 @@ static void set_scaling_min_to_cpuinfo_min(int cpu)
 		    die_id != get_physical_die_id(i))
 			continue;
 
+		adjust_scaling_max_from_base_freq(i);
 		set_cpufreq_scaling_min_max_from_cpuinfo(i, 0, 0);
 	}
 }
-- 
cgit v1.2.3


From da78fc797fa4126f626303fe4d6cb474c1a80d26 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 15 Dec 2021 17:46:56 -0800
Subject: tools/power/x86/intel-speed-select: v1.11 release

This release adds following change:
- Update max performance when BIOS disabled turbo

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 7967a04559e1..efe72fa48224 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -15,7 +15,7 @@ struct process_cmd_struct {
 	int arg;
 };
 
-static const char *version_str = "v1.10";
+static const char *version_str = "v1.11";
 static const int supported_api_ver = 1;
 static struct isst_if_platform_info isst_platform_info;
 static char *progname;
-- 
cgit v1.2.3


From beeac538c366cd2828092adecd1edab28326c55b Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 22 Dec 2021 00:51:01 +1100
Subject: selftests/powerpc: Add a test of sigreturning to an unaligned address

Add a test of sigreturning to an unaligned address (low two bits set).
This should have no effect because the hardware will mask those bits.
However it previously falsely triggered a warning when
CONFIG_PPC_RFI_SRR_DEBUG=y.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211221135101.2085547-3-mpe@ellerman.id.au
---
 tools/testing/selftests/powerpc/signal/.gitignore  |  1 +
 tools/testing/selftests/powerpc/signal/Makefile    |  1 +
 .../selftests/powerpc/signal/sigreturn_unaligned.c | 43 ++++++++++++++++++++++
 3 files changed, 45 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore
index 8f6c816099a4..9d0915777fed 100644
--- a/tools/testing/selftests/powerpc/signal/.gitignore
+++ b/tools/testing/selftests/powerpc/signal/.gitignore
@@ -5,3 +5,4 @@ sigfuz
 sigreturn_vdso
 sig_sc_double_restart
 sigreturn_kernel
+sigreturn_unaligned
diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile
index 84e201572466..f679d260afc8 100644
--- a/tools/testing/selftests/powerpc/signal/Makefile
+++ b/tools/testing/selftests/powerpc/signal/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart
 TEST_GEN_PROGS += sigreturn_kernel
+TEST_GEN_PROGS += sigreturn_unaligned
 
 CFLAGS += -maltivec
 $(OUTPUT)/signal_tm: CFLAGS += -mhtm
diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c b/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c
new file mode 100644
index 000000000000..6e58ee4f0fdf
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test sigreturn to an unaligned address, ie. low 2 bits set.
+ * Nothing bad should happen.
+ * This was able to trigger warnings with CONFIG_PPC_RFI_SRR_DEBUG=y.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+
+static void sigusr1_handler(int signo, siginfo_t *info, void *ptr)
+{
+	ucontext_t *uc = ptr;
+
+	UCONTEXT_NIA(uc) |= 3;
+}
+
+static int test_sigreturn_unaligned(void)
+{
+	struct sigaction action;
+
+	memset(&action, 0, sizeof(action));
+	action.sa_sigaction = sigusr1_handler;
+	action.sa_flags = SA_SIGINFO;
+
+	FAIL_IF(sigaction(SIGUSR1, &action, NULL) == -1);
+
+	raise(SIGUSR1);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test_sigreturn_unaligned, "sigreturn_unaligned");
+}
-- 
cgit v1.2.3


From 3f48b137d88e710b67b2bcc01aa3d77b4db610c4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 17 Dec 2021 13:02:12 +0000
Subject: kselftest: alsa: Factor out check that values meet constraints

To simplify the code a bit and allow future reuse factor the checks that
values we read are valid out of test_ctl_get_value() into a separate
function which can be reused later. As part of this extend the test to
check all the values for the control, not just the first one.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20211217130213.3893415-2-broonie@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/mixer-test.c | 141 +++++++++++++++++-------------
 1 file changed, 82 insertions(+), 59 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c
index b798a76f6825..b009fc5df605 100644
--- a/tools/testing/selftests/alsa/mixer-test.c
+++ b/tools/testing/selftests/alsa/mixer-test.c
@@ -193,124 +193,147 @@ void find_controls(void)
 	snd_config_delete(config);
 }
 
-/*
- * Check that we can read the default value and it is valid. Write
- * tests use the read value to restore the default.
- */
-void test_ctl_get_value(struct ctl_data *ctl)
+bool ctl_value_index_valid(struct ctl_data *ctl, snd_ctl_elem_value_t *val,
+			   int index)
 {
-	int err;
 	long int_val;
 	long long int64_val;
 
-	/* If the control is turned off let's be polite */
-	if (snd_ctl_elem_info_is_inactive(ctl->info)) {
-		ksft_print_msg("%s is inactive\n", ctl->name);
-		ksft_test_result_skip("get_value.%d.%d\n",
-				      ctl->card->card, ctl->elem);
-		return;
-	}
-
-	/* Can't test reading on an unreadable control */
-	if (!snd_ctl_elem_info_is_readable(ctl->info)) {
-		ksft_print_msg("%s is not readable\n", ctl->name);
-		ksft_test_result_skip("get_value.%d.%d\n",
-				      ctl->card->card, ctl->elem);
-		return;
-	}
-
-	err = snd_ctl_elem_read(ctl->card->handle, ctl->def_val);
-	if (err < 0) {
-		ksft_print_msg("snd_ctl_elem_read() failed: %s\n",
-			       snd_strerror(err));
-		goto out;
-	}
-
 	switch (snd_ctl_elem_info_get_type(ctl->info)) {
 	case SND_CTL_ELEM_TYPE_NONE:
-		ksft_print_msg("%s Invalid control type NONE\n", ctl->name);
-		err = -1;
-		break;
+		ksft_print_msg("%s.%d Invalid control type NONE\n",
+			       ctl->name, index);
+		return false;
 
 	case SND_CTL_ELEM_TYPE_BOOLEAN:
-		int_val = snd_ctl_elem_value_get_boolean(ctl->def_val, 0);
+		int_val = snd_ctl_elem_value_get_boolean(val, index);
 		switch (int_val) {
 		case 0:
 		case 1:
 			break;
 		default:
-			ksft_print_msg("%s Invalid boolean value %ld\n",
-				       ctl->name, int_val);
-			err = -1;
-			break;
+			ksft_print_msg("%s.%d Invalid boolean value %ld\n",
+				       ctl->name, index, int_val);
+			return false;
 		}
 		break;
 
 	case SND_CTL_ELEM_TYPE_INTEGER:
-		int_val = snd_ctl_elem_value_get_integer(ctl->def_val, 0);
+		int_val = snd_ctl_elem_value_get_integer(val, index);
 
 		if (int_val < snd_ctl_elem_info_get_min(ctl->info)) {
-			ksft_print_msg("%s value %ld less than minimum %ld\n",
-				       ctl->name, int_val,
+			ksft_print_msg("%s.%d value %ld less than minimum %ld\n",
+				       ctl->name, index, int_val,
 				       snd_ctl_elem_info_get_min(ctl->info));
-			err = -1;
+			return false;
 		}
 
 		if (int_val > snd_ctl_elem_info_get_max(ctl->info)) {
-			ksft_print_msg("%s value %ld more than maximum %ld\n",
-				       ctl->name, int_val,
+			ksft_print_msg("%s.%d value %ld more than maximum %ld\n",
+				       ctl->name, index, int_val,
 				       snd_ctl_elem_info_get_max(ctl->info));
-			err = -1;
+			return false;
 		}
 
 		/* Only check step size if there is one and we're in bounds */
-		if (err >= 0 && snd_ctl_elem_info_get_step(ctl->info) &&
+		if (snd_ctl_elem_info_get_step(ctl->info) &&
 		    (int_val - snd_ctl_elem_info_get_min(ctl->info) %
 		     snd_ctl_elem_info_get_step(ctl->info))) {
-			ksft_print_msg("%s value %ld invalid for step %ld minimum %ld\n",
-				       ctl->name, int_val,
+			ksft_print_msg("%s.%d value %ld invalid for step %ld minimum %ld\n",
+				       ctl->name, index, int_val,
 				       snd_ctl_elem_info_get_step(ctl->info),
 				       snd_ctl_elem_info_get_min(ctl->info));
-			err = -1;
+			return false;
 		}
 		break;
 
 	case SND_CTL_ELEM_TYPE_INTEGER64:
-		int64_val = snd_ctl_elem_value_get_integer64(ctl->def_val, 0);
+		int64_val = snd_ctl_elem_value_get_integer64(val, index);
 
 		if (int64_val < snd_ctl_elem_info_get_min64(ctl->info)) {
-			ksft_print_msg("%s value %lld less than minimum %lld\n",
-				       ctl->name, int64_val,
+			ksft_print_msg("%s.%d value %lld less than minimum %lld\n",
+				       ctl->name, index, int64_val,
 				       snd_ctl_elem_info_get_min64(ctl->info));
-			err = -1;
+			return false;
 		}
 
 		if (int64_val > snd_ctl_elem_info_get_max64(ctl->info)) {
-			ksft_print_msg("%s value %lld more than maximum %lld\n",
-				       ctl->name, int64_val,
+			ksft_print_msg("%s.%d value %lld more than maximum %lld\n",
+				       ctl->name, index, int64_val,
 				       snd_ctl_elem_info_get_max(ctl->info));
-			err = -1;
+			return false;
 		}
 
 		/* Only check step size if there is one and we're in bounds */
-		if (err >= 0 && snd_ctl_elem_info_get_step64(ctl->info) &&
+		if (snd_ctl_elem_info_get_step64(ctl->info) &&
 		    (int64_val - snd_ctl_elem_info_get_min64(ctl->info)) %
 		    snd_ctl_elem_info_get_step64(ctl->info)) {
-			ksft_print_msg("%s value %lld invalid for step %lld minimum %lld\n",
-				       ctl->name, int64_val,
+			ksft_print_msg("%s.%d value %lld invalid for step %lld minimum %lld\n",
+				       ctl->name, index, int64_val,
 				       snd_ctl_elem_info_get_step64(ctl->info),
 				       snd_ctl_elem_info_get_min64(ctl->info));
-			err = -1;
+			return false;
 		}
 		break;
 
 	default:
 		/* No tests for other types */
+		break;
+	}
+
+	return true;
+}
+
+/*
+ * Check that the provided value meets the constraints for the
+ * provided control.
+ */
+bool ctl_value_valid(struct ctl_data *ctl, snd_ctl_elem_value_t *val)
+{
+	int i;
+	bool valid = true;
+
+	for (i = 0; i < snd_ctl_elem_info_get_count(ctl->info); i++)
+		if (!ctl_value_index_valid(ctl, val, i))
+			valid = false;
+
+	return valid;
+}
+
+/*
+ * Check that we can read the default value and it is valid. Write
+ * tests use the read value to restore the default.
+ */
+void test_ctl_get_value(struct ctl_data *ctl)
+{
+	int err;
+
+	/* If the control is turned off let's be polite */
+	if (snd_ctl_elem_info_is_inactive(ctl->info)) {
+		ksft_print_msg("%s is inactive\n", ctl->name);
+		ksft_test_result_skip("get_value.%d.%d\n",
+				      ctl->card->card, ctl->elem);
+		return;
+	}
+
+	/* Can't test reading on an unreadable control */
+	if (!snd_ctl_elem_info_is_readable(ctl->info)) {
+		ksft_print_msg("%s is not readable\n", ctl->name);
 		ksft_test_result_skip("get_value.%d.%d\n",
 				      ctl->card->card, ctl->elem);
 		return;
 	}
 
+	err = snd_ctl_elem_read(ctl->card->handle, ctl->def_val);
+	if (err < 0) {
+		ksft_print_msg("snd_ctl_elem_read() failed: %s\n",
+			       snd_strerror(err));
+		goto out;
+	}
+
+	if (!ctl_value_valid(ctl, ctl->def_val))
+		err = -EINVAL;
+
 out:
 	ksft_test_result(err >= 0, "get_value.%d.%d\n",
 			 ctl->card->card, ctl->elem);
-- 
cgit v1.2.3


From 10f2f194663af178f32aeb4086fc3f6687d25056 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 17 Dec 2021 13:02:13 +0000
Subject: kselftest: alsa: Validate values read from enumerations

Enumerations should return a value between 0 and items-1, check that this
is the case.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20211217130213.3893415-3-broonie@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/mixer-test.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/mixer-test.c b/tools/testing/selftests/alsa/mixer-test.c
index b009fc5df605..17f158d7a767 100644
--- a/tools/testing/selftests/alsa/mixer-test.c
+++ b/tools/testing/selftests/alsa/mixer-test.c
@@ -276,6 +276,23 @@ bool ctl_value_index_valid(struct ctl_data *ctl, snd_ctl_elem_value_t *val,
 		}
 		break;
 
+	case SND_CTL_ELEM_TYPE_ENUMERATED:
+		int_val = snd_ctl_elem_value_get_enumerated(val, index);
+
+		if (int_val < 0) {
+			ksft_print_msg("%s.%d negative value %ld for enumeration\n",
+				       ctl->name, index, int_val);
+			return false;
+		}
+
+		if (int_val >= snd_ctl_elem_info_get_items(ctl->info)) {
+			ksft_print_msg("%s.%d value %ld more than item count %ld\n",
+				       ctl->name, index, int_val,
+				       snd_ctl_elem_info_get_items(ctl->info));
+			return false;
+		}
+		break;
+
 	default:
 		/* No tests for other types */
 		break;
-- 
cgit v1.2.3


From e6007b85dfa284c4726c249e3c2fc4181ca8e179 Mon Sep 17 00:00:00 2001
From: Ma Xinjian <xinjianx.ma@intel.com>
Date: Fri, 24 Dec 2021 17:59:28 +0800
Subject: selftests: mptcp: Remove the deprecated config NFT_COUNTER

NFT_COUNTER was removed since
390ad4295aa ("netfilter: nf_tables: make counter support built-in")
LKP/0Day will check if all configs listing under selftests are able to
be enabled properly.

For the missing configs, it will report something like:
LKP WARN miss config CONFIG_NFT_COUNTER= of net/mptcp/config

- it's not reasonable to keep the deprecated configs.
- configs under kselftests are recommended by corresponding tests.
So if some configs are missing, it will impact the testing results

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Ma Xinjian <xinjianx.ma@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/config | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index 0faaccd21447..2b82628decb1 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -9,7 +9,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_ADVANCED=y
 CONFIG_NETFILTER_NETLINK=m
 CONFIG_NF_TABLES=m
-CONFIG_NFT_COUNTER=m
 CONFIG_NFT_COMPAT=m
 CONFIG_NETFILTER_XTABLES=m
 CONFIG_NETFILTER_XT_MATCH_BPF=m
-- 
cgit v1.2.3


From 53e8558837be58c1d44d50ad87247a8c56c95c13 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 22 Dec 2021 12:32:34 +0800
Subject: ACPI: tools: Introduce utility for firmware updates/telemetry

Introduce a user space tool to make use of the interface exposed by
Platform Firmware Runtime Update and Telemetry drivers.

It can be used for firmware code injection, driver updates and
to retrieve platform firmware telemetry data.

Tested-by: Hongyu Ning <hongyu.ning@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/power/acpi/.gitignore           |   1 +
 tools/power/acpi/Makefile             |  16 +-
 tools/power/acpi/Makefile.rules       |   2 +-
 tools/power/acpi/man/pfrut.8          | 137 +++++++++++
 tools/power/acpi/tools/pfrut/Makefile |  23 ++
 tools/power/acpi/tools/pfrut/pfrut.c  | 424 ++++++++++++++++++++++++++++++++++
 6 files changed, 594 insertions(+), 9 deletions(-)
 create mode 100644 tools/power/acpi/man/pfrut.8
 create mode 100644 tools/power/acpi/tools/pfrut/Makefile
 create mode 100644 tools/power/acpi/tools/pfrut/pfrut.c

(limited to 'tools')

diff --git a/tools/power/acpi/.gitignore b/tools/power/acpi/.gitignore
index 0b319fc8bb17..eada0297ef88 100644
--- a/tools/power/acpi/.gitignore
+++ b/tools/power/acpi/.gitignore
@@ -2,4 +2,5 @@
 /acpidbg
 /acpidump
 /ec
+/pfrut
 /include/
diff --git a/tools/power/acpi/Makefile b/tools/power/acpi/Makefile
index a249c50ebf55..5ff1d9c864d0 100644
--- a/tools/power/acpi/Makefile
+++ b/tools/power/acpi/Makefile
@@ -9,18 +9,18 @@ include ../../scripts/Makefile.include
 
 .NOTPARALLEL:
 
-all: acpidbg acpidump ec
-clean: acpidbg_clean acpidump_clean ec_clean
-install: acpidbg_install acpidump_install ec_install
-uninstall: acpidbg_uninstall acpidump_uninstall ec_uninstall
+all: acpidbg acpidump ec pfrut
+clean: acpidbg_clean acpidump_clean ec_clean pfrut_clean
+install: acpidbg_install acpidump_install ec_install pfrut_install
+uninstall: acpidbg_uninstall acpidump_uninstall ec_uninstall pfrut_uninstall
 
-acpidbg acpidump ec: FORCE
+acpidbg acpidump ec pfrut: FORCE
 	$(call descend,tools/$@,all)
-acpidbg_clean acpidump_clean ec_clean:
+acpidbg_clean acpidump_clean ec_clean pfrut_clean:
 	$(call descend,tools/$(@:_clean=),clean)
-acpidbg_install acpidump_install ec_install:
+acpidbg_install acpidump_install ec_install pfrut_install:
 	$(call descend,tools/$(@:_install=),install)
-acpidbg_uninstall acpidump_uninstall ec_uninstall:
+acpidbg_uninstall acpidump_uninstall ec_uninstall pfrut_uninstall:
 	$(call descend,tools/$(@:_uninstall=),uninstall)
 
 .PHONY: FORCE
diff --git a/tools/power/acpi/Makefile.rules b/tools/power/acpi/Makefile.rules
index 1d7616f5d0ae..b71aada77688 100644
--- a/tools/power/acpi/Makefile.rules
+++ b/tools/power/acpi/Makefile.rules
@@ -9,7 +9,7 @@ objdir := $(OUTPUT)tools/$(TOOL)/
 toolobjs := $(addprefix $(objdir),$(TOOL_OBJS))
 $(OUTPUT)$(TOOL): $(toolobjs) FORCE
 	$(ECHO) "  LD      " $(subst $(OUTPUT),,$@)
-	$(QUIET) $(LD) $(CFLAGS) $(LDFLAGS) $(toolobjs) -L$(OUTPUT) -o $@
+	$(QUIET) $(LD) $(CFLAGS) $(toolobjs) $(LDFLAGS) -L$(OUTPUT) -o $@
 	$(ECHO) "  STRIP   " $(subst $(OUTPUT),,$@)
 	$(QUIET) $(STRIPCMD) $@
 
diff --git a/tools/power/acpi/man/pfrut.8 b/tools/power/acpi/man/pfrut.8
new file mode 100644
index 000000000000..3db574770e8d
--- /dev/null
+++ b/tools/power/acpi/man/pfrut.8
@@ -0,0 +1,137 @@
+.TH "PFRUT" "8" "October 2021" "pfrut 1.0" ""
+.hy
+.SH Name
+.PP
+pfrut \- Platform Firmware Runtime Update and Telemetry tool
+.SH SYNOPSIS
+.PP
+\f[B]pfrut\f[R] [\f[I]Options\f[R]]
+.SH DESCRIPTION
+.PP
+The PFRUT(Platform Firmware Runtime Update and Telemetry) kernel interface is designed
+to
+.PD 0
+.P
+.PD
+interact with the platform firmware interface defined in the
+.PD 0
+.P
+.PD
+Management Mode Firmware Runtime
+Update (https://uefi.org/sites/default/files/resources/Intel_MM_OS_Interface_Spec_Rev100.pdf)
+.PD 0
+.P
+.PD
+\f[B]pfrut\f[R] is the tool to interact with the kernel interface.
+.PD 0
+.P
+.PD
+.SH OPTIONS
+.TP
+.B \f[B]\-h\f[R], \f[B]\-\-help\f[R]
+Display helper information.
+.TP
+.B \f[B]\-l\f[R], \f[B]\-\-load\f[R]
+Load the capsule file into the system.
+To be more specific, the capsule file will be copied to the
+communication buffer.
+.TP
+.B \f[B]\-s\f[R], \f[B]\-\-stage\f[R]
+Stage the capsule image from communication buffer into Management Mode
+and perform authentication.
+.TP
+.B \f[B]\-a\f[R], \f[B]\-\-activate\f[R]
+Activate a previous staged capsule image.
+.TP
+.B \f[B]\-u\f[R], \f[B]\-\-update\f[R]
+Perform both stage and activation actions.
+.TP
+.B \f[B]\-q\f[R], \f[B]\-\-query\f[R]
+Query the update capability.
+.TP
+.B \f[B]\-d\f[R], \f[B]\-\-setrev\f[R]
+Set the revision ID of code injection/driver update.
+.TP
+.B \f[B]\-D\f[R], \f[B]\-\-setrevlog\f[R]
+Set the revision ID of telemetry.
+.TP
+.B \f[B]\-G\f[R], \f[B]\-\-getloginfo\f[R]
+Get telemetry log information and print it out.
+.TP
+.B \f[B]\-T\f[R], \f[B]\-\-type\f[R]
+Set the telemetry log data type.
+.TP
+.B \f[B]\-L\f[R], \f[B]\-\-level\f[R]
+Set the telemetry log level.
+.TP
+.B \f[B]\-R\f[R], \f[B]\-\-read\f[R]
+Read all the telemetry data and print it out.
+.SH EXAMPLES
+.PP
+\f[B]pfrut \-G\f[R]
+.PP
+log_level:4
+.PD 0
+.P
+.PD
+log_type:0
+.PD 0
+.P
+.PD
+log_revid:2
+.PD 0
+.P
+.PD
+max_data_size:65536
+.PD 0
+.P
+.PD
+chunk1_size:0
+.PD 0
+.P
+.PD
+chunk2_size:1401
+.PD 0
+.P
+.PD
+rollover_cnt:0
+.PD 0
+.P
+.PD
+reset_cnt:4
+.PP
+\f[B]pfru \-q\f[R]
+.PP
+code injection image type:794bf8b2\-6e7b\-454e\-885f\-3fb9bb185402
+.PD 0
+.P
+.PD
+fw_version:0
+.PD 0
+.P
+.PD
+code_rt_version:1
+.PD 0
+.P
+.PD
+driver update image type:0e5f0b14\-f849\-7945\-ad81\-bc7b6d2bb245
+.PD 0
+.P
+.PD
+drv_rt_version:0
+.PD 0
+.P
+.PD
+drv_svn:0
+.PD 0
+.P
+.PD
+platform id:39214663\-b1a8\-4eaa\-9024\-f2bb53ea4723
+.PD 0
+.P
+.PD
+oem id:a36db54f\-ea2a\-e14e\-b7c4\-b5780e51ba3d
+.PP
+\f[B]pfrut \-l yours.cap \-u \-T 1 \-L 4\f[R]
+.SH AUTHORS
+Chen Yu.
diff --git a/tools/power/acpi/tools/pfrut/Makefile b/tools/power/acpi/tools/pfrut/Makefile
new file mode 100644
index 000000000000..61c1a96fd433
--- /dev/null
+++ b/tools/power/acpi/tools/pfrut/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+include ../../Makefile.config
+
+TOOL = pfrut
+EXTRA_INSTALL = install-man
+EXTRA_UNINSTALL = uninstall-man
+
+CFLAGS += -Wall -O2
+CFLAGS += -DPFRUT_HEADER='"../../../../../include/uapi/linux/pfrut.h"'
+LDFLAGS += -luuid
+
+TOOL_OBJS = \
+	pfrut.o
+
+include ../../Makefile.rules
+
+install-man: $(srctree)/man/pfrut.8
+	$(ECHO) "  INST    " pfrut.8
+	$(QUIET) $(INSTALL_DATA) -D $< $(DESTDIR)$(mandir)/man8/pfrut.8
+uninstall-man:
+	$(ECHO) "  UNINST  " pfrut.8
+	$(QUIET) rm -f $(DESTDIR)$(mandir)/man8/pfrut.8
diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c
new file mode 100644
index 000000000000..d79c335594b2
--- /dev/null
+++ b/tools/power/acpi/tools/pfrut/pfrut.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Platform Firmware Runtime Update tool to do Management
+ * Mode code injection/driver update and telemetry retrieval.
+ *
+ * This tool uses the interfaces provided by pfr_update and
+ * pfr_telemetry drivers. These interfaces are exposed via
+ * /dev/pfr_update and /dev/pfr_telemetry. Write operation
+ * on the /dev/pfr_update is to load the EFI capsule into
+ * kernel space. Mmap/read operations on /dev/pfr_telemetry
+ * could be used to read the telemetry data to user space.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <uuid/uuid.h>
+#include PFRUT_HEADER
+
+char *capsule_name;
+int action, query_cap, log_type, log_level, log_read, log_getinfo,
+	revid, log_revid;
+int set_log_level, set_log_type,
+	set_revid, set_log_revid;
+
+char *progname;
+
+#define LOG_ERR		0
+#define LOG_WARN	1
+#define LOG_INFO	2
+#define LOG_VERB	4
+#define LOG_EXEC_IDX	0
+#define LOG_HISTORY_IDX	1
+#define REVID_1		1
+#define REVID_2		2
+
+static int valid_log_level(int level)
+{
+	return level == LOG_ERR || level == LOG_WARN ||
+	       level == LOG_INFO || level == LOG_VERB;
+}
+
+static int valid_log_type(int type)
+{
+	return type == LOG_EXEC_IDX || type == LOG_HISTORY_IDX;
+}
+
+static inline int valid_log_revid(int id)
+{
+	return id == REVID_1 || id == REVID_2;
+}
+
+static void help(void)
+{
+	fprintf(stderr,
+		"usage: %s [OPTIONS]\n"
+		" code injection:\n"
+		"  -l, --load\n"
+		"  -s, --stage\n"
+		"  -a, --activate\n"
+		"  -u, --update [stage and activate]\n"
+		"  -q, --query\n"
+		"  -d, --revid update\n"
+		" telemetry:\n"
+		"  -G, --getloginfo\n"
+		"  -T, --type(0:execution, 1:history)\n"
+		"  -L, --level(0, 1, 2, 4)\n"
+		"  -R, --read\n"
+		"  -D, --revid log\n",
+		progname);
+}
+
+char *option_string = "l:sauqd:GT:L:RD:h";
+static struct option long_options[] = {
+	{"load", required_argument, 0, 'l'},
+	{"stage", no_argument, 0, 's'},
+	{"activate", no_argument, 0, 'a'},
+	{"update", no_argument, 0, 'u'},
+	{"query", no_argument, 0, 'q'},
+	{"getloginfo", no_argument, 0, 'G'},
+	{"type", required_argument, 0, 'T'},
+	{"level", required_argument, 0, 'L'},
+	{"read", no_argument, 0, 'R'},
+	{"setrev", required_argument, 0, 'd'},
+	{"setrevlog", required_argument, 0, 'D'},
+	{"help", no_argument, 0, 'h'},
+	{}
+};
+
+static void parse_options(int argc, char **argv)
+{
+	int option_index = 0;
+	char *pathname;
+	int opt;
+
+	pathname = strdup(argv[0]);
+	progname = basename(pathname);
+
+	while ((opt = getopt_long_only(argc, argv, option_string,
+				       long_options, &option_index)) != -1) {
+		switch (opt) {
+		case 'l':
+			capsule_name = optarg;
+			break;
+		case 's':
+			action = 1;
+			break;
+		case 'a':
+			action = 2;
+			break;
+		case 'u':
+			action = 3;
+			break;
+		case 'q':
+			query_cap = 1;
+			break;
+		case 'G':
+			log_getinfo = 1;
+			break;
+		case 'T':
+			log_type = atoi(optarg);
+			set_log_type = 1;
+			break;
+		case 'L':
+			log_level = atoi(optarg);
+			set_log_level = 1;
+			break;
+		case 'R':
+			log_read = 1;
+			break;
+		case 'd':
+			revid = atoi(optarg);
+			set_revid = 1;
+			break;
+		case 'D':
+			log_revid = atoi(optarg);
+			set_log_revid = 1;
+			break;
+		case 'h':
+			help();
+			exit(0);
+		default:
+			break;
+		}
+	}
+}
+
+void print_cap(struct pfru_update_cap_info *cap)
+{
+	char *uuid;
+
+	uuid = malloc(37);
+	if (!uuid) {
+		perror("Can not allocate uuid buffer\n");
+		exit(1);
+	}
+
+	uuid_unparse(cap->code_type, uuid);
+	printf("code injection image type:%s\n", uuid);
+	printf("fw_version:%d\n", cap->fw_version);
+	printf("code_rt_version:%d\n", cap->code_rt_version);
+
+	uuid_unparse(cap->drv_type, uuid);
+	printf("driver update image type:%s\n", uuid);
+	printf("drv_rt_version:%d\n", cap->drv_rt_version);
+	printf("drv_svn:%d\n", cap->drv_svn);
+
+	uuid_unparse(cap->platform_id, uuid);
+	printf("platform id:%s\n", uuid);
+	uuid_unparse(cap->oem_id, uuid);
+	printf("oem id:%s\n", uuid);
+	printf("oem information length:%d\n", cap->oem_info_len);
+
+	free(uuid);
+}
+
+int main(int argc, char *argv[])
+{
+	int fd_update, fd_update_log, fd_capsule;
+	struct pfrt_log_data_info data_info;
+	struct pfrt_log_info info;
+	struct pfru_update_cap_info cap;
+	void *addr_map_capsule;
+	struct stat st;
+	char *log_buf;
+	int ret = 0;
+
+	if (getuid() != 0) {
+		printf("Please run the tool as root - Exiting.\n");
+		return 1;
+	}
+
+	parse_options(argc, argv);
+
+	fd_update = open("/dev/acpi_pfr_update0", O_RDWR);
+	if (fd_update < 0) {
+		printf("PFRU device not supported - Quit...\n");
+		return 1;
+	}
+
+	fd_update_log = open("/dev/acpi_pfr_telemetry0", O_RDWR);
+	if (fd_update_log < 0) {
+		printf("PFRT device not supported - Quit...\n");
+		return 1;
+	}
+
+	if (query_cap) {
+		ret = ioctl(fd_update, PFRU_IOC_QUERY_CAP, &cap);
+		if (ret)
+			perror("Query Update Capability info failed.");
+		else
+			print_cap(&cap);
+
+		close(fd_update);
+		close(fd_update_log);
+
+		return ret;
+	}
+
+	if (log_getinfo) {
+		ret = ioctl(fd_update_log, PFRT_LOG_IOC_GET_DATA_INFO, &data_info);
+		if (ret) {
+			perror("Get telemetry data info failed.");
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		ret = ioctl(fd_update_log, PFRT_LOG_IOC_GET_INFO, &info);
+		if (ret) {
+			perror("Get telemetry info failed.");
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		printf("log_level:%d\n", info.log_level);
+		printf("log_type:%d\n", info.log_type);
+		printf("log_revid:%d\n", info.log_revid);
+		printf("max_data_size:%d\n", data_info.max_data_size);
+		printf("chunk1_size:%d\n", data_info.chunk1_size);
+		printf("chunk2_size:%d\n", data_info.chunk2_size);
+		printf("rollover_cnt:%d\n", data_info.rollover_cnt);
+		printf("reset_cnt:%d\n", data_info.reset_cnt);
+
+		return 0;
+	}
+
+	info.log_level = -1;
+	info.log_type = -1;
+	info.log_revid = -1;
+
+	if (set_log_level) {
+		if (!valid_log_level(log_level)) {
+			printf("Invalid log level %d\n",
+			       log_level);
+		} else {
+			info.log_level = log_level;
+		}
+	}
+
+	if (set_log_type) {
+		if (!valid_log_type(log_type)) {
+			printf("Invalid log type %d\n",
+			       log_type);
+		} else {
+			info.log_type = log_type;
+		}
+	}
+
+	if (set_log_revid) {
+		if (!valid_log_revid(log_revid)) {
+			printf("Invalid log revid %d, unchanged.\n",
+			       log_revid);
+		} else {
+			info.log_revid = log_revid;
+		}
+	}
+
+	ret = ioctl(fd_update_log, PFRT_LOG_IOC_SET_INFO, &info);
+	if (ret) {
+		perror("Log information set failed.(log_level, log_type, log_revid)");
+		close(fd_update);
+		close(fd_update_log);
+
+		return 1;
+	}
+
+	if (set_revid) {
+		ret = ioctl(fd_update, PFRU_IOC_SET_REV, &revid);
+		if (ret) {
+			perror("pfru update revid set failed");
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		printf("pfru update revid set to %d\n", revid);
+	}
+
+	if (capsule_name) {
+		fd_capsule = open(capsule_name, O_RDONLY);
+		if (fd_capsule < 0) {
+			perror("Can not open capsule file...");
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		if (fstat(fd_capsule, &st) < 0) {
+			perror("Can not fstat capsule file...");
+			close(fd_capsule);
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		addr_map_capsule = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED,
+					fd_capsule, 0);
+		if (addr_map_capsule == MAP_FAILED) {
+			perror("Failed to mmap capsule file.");
+			close(fd_capsule);
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		ret = write(fd_update, (char *)addr_map_capsule, st.st_size);
+		printf("Load %d bytes of capsule file into the system\n",
+		       ret);
+
+		if (ret == -1) {
+			perror("Failed to load capsule file");
+			close(fd_capsule);
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		munmap(addr_map_capsule, st.st_size);
+		close(fd_capsule);
+		printf("Load done.\n");
+	}
+
+	if (action) {
+		if (action == 1) {
+			ret = ioctl(fd_update, PFRU_IOC_STAGE, NULL);
+		} else if (action == 2) {
+			ret = ioctl(fd_update, PFRU_IOC_ACTIVATE, NULL);
+		} else if (action == 3) {
+			ret = ioctl(fd_update, PFRU_IOC_STAGE_ACTIVATE, NULL);
+		} else {
+			close(fd_update);
+			close(fd_update_log);
+
+			return 1;
+		}
+		printf("Update finished, return %d\n", ret);
+	}
+
+	close(fd_update);
+
+	if (log_read) {
+		void *p_mmap;
+		int max_data_sz;
+
+		ret = ioctl(fd_update_log, PFRT_LOG_IOC_GET_DATA_INFO, &data_info);
+		if (ret) {
+			perror("Get telemetry data info failed.");
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		max_data_sz = data_info.max_data_size;
+		if (!max_data_sz) {
+			printf("No telemetry data available.\n");
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		log_buf = malloc(max_data_sz + 1);
+		if (!log_buf) {
+			perror("log_buf allocate failed.");
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		p_mmap = mmap(NULL, max_data_sz, PROT_READ, MAP_SHARED, fd_update_log, 0);
+		if (p_mmap == MAP_FAILED) {
+			perror("mmap error.");
+			close(fd_update_log);
+
+			return 1;
+		}
+
+		memcpy(log_buf, p_mmap, max_data_sz);
+		log_buf[max_data_sz] = '\0';
+		printf("%s\n", log_buf);
+		free(log_buf);
+
+		munmap(p_mmap, max_data_sz);
+	}
+
+	close(fd_update_log);
+
+	return 0;
+}
-- 
cgit v1.2.3


From cb7c4f364abd09abd1865fa049ef492fb43e6bf3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:04 +0000
Subject: KVM: selftests: arm64: Initialise default guest mode at test startup
 time

As we are going to add support for a variable default mode on arm64,
let's make sure it is setup first by using a constructor that gets
called before the actual test runs.

Suggested-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211227124809.1335409-2-maz@kernel.org
---
 tools/testing/selftests/kvm/lib/aarch64/processor.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index b4eeeafd2a70..b509341b8411 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -432,3 +432,12 @@ uint32_t guest_get_vcpuid(void)
 {
 	return read_sysreg(tpidr_el1);
 }
+
+/*
+ * arm64 doesn't have a true default mode, so start by computing the
+ * available IPA space and page sizes early.
+ */
+void __attribute__((constructor)) init_guest_modes(void)
+{
+       guest_modes_append_default();
+}
-- 
cgit v1.2.3


From 357c628e1248dd53f5c43a768246a83478a7f489 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:05 +0000
Subject: KVM: selftests: arm64: Introduce a variable default IPA size

Contrary to popular belief, there is no such thing as a default
IPA size on arm64. Anything goes, and implementations are the
usual Wild West.

The selftest infrastructure default to 40bit IPA, which obviously
doesn't work for some systems out there.

Turn VM_MODE_DEFAULT from a constant into a variable, and let
guest_modes_append_default() populate it, depending on what
the HW can do. In order to preserve the current behaviour, we
still pick 40bits IPA as the default if it is available, and
the largest supported IPA space otherwise.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211227124809.1335409-3-maz@kernel.org
---
 tools/testing/selftests/kvm/include/kvm_util.h |  4 +++-
 tools/testing/selftests/kvm/lib/guest_modes.c  | 30 +++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 6a1a37f30494..ec070d800731 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -53,7 +53,9 @@ enum vm_guest_mode {
 
 #if defined(__aarch64__)
 
-#define VM_MODE_DEFAULT			VM_MODE_P40V48_4K
+extern enum vm_guest_mode vm_mode_default;
+
+#define VM_MODE_DEFAULT			vm_mode_default
 #define MIN_PAGE_SHIFT			12U
 #define ptes_per_page(page_size)	((page_size) / 8)
 
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index c330f414ef96..5e3fdbd992fd 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -4,22 +4,46 @@
  */
 #include "guest_modes.h"
 
+#ifdef __aarch64__
+enum vm_guest_mode vm_mode_default;
+#endif
+
 struct guest_mode guest_modes[NUM_VM_MODES];
 
 void guest_modes_append_default(void)
 {
+#ifndef __aarch64__
 	guest_mode_append(VM_MODE_DEFAULT, true, true);
-
-#ifdef __aarch64__
-	guest_mode_append(VM_MODE_P40V48_64K, true, true);
+#else
 	{
 		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+		int i;
+
+		vm_mode_default = NUM_VM_MODES;
+
 		if (limit >= 52)
 			guest_mode_append(VM_MODE_P52V48_64K, true, true);
 		if (limit >= 48) {
 			guest_mode_append(VM_MODE_P48V48_4K, true, true);
 			guest_mode_append(VM_MODE_P48V48_64K, true, true);
 		}
+		if (limit >= 40) {
+			guest_mode_append(VM_MODE_P40V48_4K, true, true);
+			guest_mode_append(VM_MODE_P40V48_64K, true, true);
+			vm_mode_default = VM_MODE_P40V48_4K;
+		}
+
+		/*
+		 * Pick the first supported IPA size if the default
+		 * isn't available.
+		 */
+		for (i = 0; vm_mode_default == NUM_VM_MODES && i < NUM_VM_MODES; i++) {
+			if (guest_modes[i].supported && guest_modes[i].enabled)
+				vm_mode_default = i;
+		}
+
+		TEST_ASSERT(vm_mode_default != NUM_VM_MODES,
+			    "No supported mode!");
 	}
 #endif
 #ifdef __s390x__
-- 
cgit v1.2.3


From 0303ffdb9ecffac4654b16bbf69ba84d131eb8b7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:06 +0000
Subject: KVM: selftests: arm64: Check for supported page sizes

Just as arm64 implemenations don't necessary support all IPA
ranges, they don't  all support the same page sizes either. Fun.

Create a dummy VM to snapshot the page sizes supported by the
host, and filter the supported modes.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211227124809.1335409-4-maz@kernel.org
---
 .../selftests/kvm/include/aarch64/processor.h      |  3 ++
 .../testing/selftests/kvm/lib/aarch64/processor.c  | 36 ++++++++++++++++++++++
 tools/testing/selftests/kvm/lib/guest_modes.c      | 17 ++++++----
 3 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 27d8e1bb5b36..8f9f46979a00 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -113,6 +113,9 @@ enum {
 #define ESR_EC_WP_CURRENT	0x35
 #define ESR_EC_BRK_INS		0x3c
 
+void aarch64_get_supported_page_sizes(uint32_t ipa,
+				      bool *ps4k, bool *ps16k, bool *ps64k);
+
 void vm_init_descriptor_tables(struct kvm_vm *vm);
 void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid);
 
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index b509341b8411..49fcfe9768e0 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -8,6 +8,7 @@
 #include <linux/compiler.h>
 #include <assert.h>
 
+#include "guest_modes.h"
 #include "kvm_util.h"
 #include "../kvm_util_internal.h"
 #include "processor.h"
@@ -433,6 +434,41 @@ uint32_t guest_get_vcpuid(void)
 	return read_sysreg(tpidr_el1);
 }
 
+void aarch64_get_supported_page_sizes(uint32_t ipa,
+				      bool *ps4k, bool *ps16k, bool *ps64k)
+{
+	struct kvm_vcpu_init preferred_init;
+	int kvm_fd, vm_fd, vcpu_fd, err;
+	uint64_t val;
+	struct kvm_one_reg reg = {
+		.id	= KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1),
+		.addr	= (uint64_t)&val,
+	};
+
+	kvm_fd = open_kvm_dev_path_or_exit();
+	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, ipa);
+	TEST_ASSERT(vm_fd >= 0, "Can't create VM");
+
+	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
+	TEST_ASSERT(vcpu_fd >= 0, "Can't create vcpu");
+
+	err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init);
+	TEST_ASSERT(err == 0, "Can't get target");
+	err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init);
+	TEST_ASSERT(err == 0, "Can't get init vcpu");
+
+	err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
+	TEST_ASSERT(err == 0, "Can't get MMFR0");
+
+	*ps4k = ((val >> 28) & 0xf) != 0xf;
+	*ps64k = ((val >> 24) & 0xf) == 0;
+	*ps16k = ((val >> 20) & 0xf) != 0;
+
+	close(vcpu_fd);
+	close(vm_fd);
+	close(kvm_fd);
+}
+
 /*
  * arm64 doesn't have a true default mode, so start by computing the
  * available IPA space and page sizes early.
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index 5e3fdbd992fd..67144fdac433 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -5,6 +5,7 @@
 #include "guest_modes.h"
 
 #ifdef __aarch64__
+#include "processor.h"
 enum vm_guest_mode vm_mode_default;
 #endif
 
@@ -17,20 +18,24 @@ void guest_modes_append_default(void)
 #else
 	{
 		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+		bool ps4k, ps16k, ps64k;
 		int i;
 
+		aarch64_get_supported_page_sizes(limit, &ps4k, &ps16k, &ps64k);
+
 		vm_mode_default = NUM_VM_MODES;
 
 		if (limit >= 52)
-			guest_mode_append(VM_MODE_P52V48_64K, true, true);
+			guest_mode_append(VM_MODE_P52V48_64K, ps64k, ps64k);
 		if (limit >= 48) {
-			guest_mode_append(VM_MODE_P48V48_4K, true, true);
-			guest_mode_append(VM_MODE_P48V48_64K, true, true);
+			guest_mode_append(VM_MODE_P48V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P48V48_64K, ps64k, ps64k);
 		}
 		if (limit >= 40) {
-			guest_mode_append(VM_MODE_P40V48_4K, true, true);
-			guest_mode_append(VM_MODE_P40V48_64K, true, true);
-			vm_mode_default = VM_MODE_P40V48_4K;
+			guest_mode_append(VM_MODE_P40V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P40V48_64K, ps64k, ps64k);
+			if (ps4k)
+				vm_mode_default = VM_MODE_P40V48_4K;
 		}
 
 		/*
-- 
cgit v1.2.3


From 2f41a61c54fb6410202b2cc08be80ae9554d599d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:07 +0000
Subject: KVM: selftests: arm64: Rework TCR_EL1 configuration

The current way we initialise TCR_EL1 is a bit cumbersome, as
we mix setting TG0 and IPS in the same swtch statement.

Split it into two statements (one for the base granule size, and
another for the IPA size), allowing new modes to be added in a
more elegant way.

No functional change intended.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211227124809.1335409-5-maz@kernel.org
---
 tools/testing/selftests/kvm/lib/aarch64/processor.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 49fcfe9768e0..d005543aa3e2 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -238,6 +238,7 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 	get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1);
 	get_reg(vm, vcpuid, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1);
 
+	/* Configure base granule size */
 	switch (vm->mode) {
 	case VM_MODE_P52V48_4K:
 		TEST_FAIL("AArch64 does not support 4K sized pages "
@@ -246,23 +247,29 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 		TEST_FAIL("AArch64 does not support 4K sized pages "
 			  "with ANY-bit physical address ranges");
 	case VM_MODE_P52V48_64K:
+	case VM_MODE_P48V48_64K:
+	case VM_MODE_P40V48_64K:
 		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
-		tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
 		break;
 	case VM_MODE_P48V48_4K:
+	case VM_MODE_P40V48_4K:
 		tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
-		tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
 		break;
+	default:
+		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+	}
+
+	/* Configure output size */
+	switch (vm->mode) {
+	case VM_MODE_P52V48_64K:
+		tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
+		break;
+	case VM_MODE_P48V48_4K:
 	case VM_MODE_P48V48_64K:
-		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
 		tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
 		break;
 	case VM_MODE_P40V48_4K:
-		tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
-		tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
-		break;
 	case VM_MODE_P40V48_64K:
-		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
 		tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
 		break;
 	default:
-- 
cgit v1.2.3


From e7f58a6bd28bfd2e4f60312abf48f07de2c4121c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:08 +0000
Subject: KVM: selftests: arm64: Add support for VM_MODE_P36V48_{4K,64K}

Some of the arm64 systems out there have an IPA space that is
positively tiny. Nonetheless, they make great KVM hosts.

Add support for 36bit IPA support with 4kB pages, which makes
some of the fruity machines happy. Whilst we're at it, add support
for 64kB pages as well, though these boxes have no support for it.

Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211227124809.1335409-6-maz@kernel.org
---
 tools/testing/selftests/kvm/include/kvm_util.h      | 2 ++
 tools/testing/selftests/kvm/lib/aarch64/processor.c | 6 ++++++
 tools/testing/selftests/kvm/lib/guest_modes.c       | 4 ++++
 tools/testing/selftests/kvm/lib/kvm_util.c          | 6 ++++++
 4 files changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index ec070d800731..4e9e01631bc3 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -48,6 +48,8 @@ enum vm_guest_mode {
 	VM_MODE_PXXV48_4K,	/* For 48bits VA but ANY bits PA */
 	VM_MODE_P47V64_4K,
 	VM_MODE_P44V64_4K,
+	VM_MODE_P36V48_4K,
+	VM_MODE_P36V48_64K,
 	NUM_VM_MODES,
 };
 
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index d005543aa3e2..70395c777ea4 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -249,10 +249,12 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 	case VM_MODE_P52V48_64K:
 	case VM_MODE_P48V48_64K:
 	case VM_MODE_P40V48_64K:
+	case VM_MODE_P36V48_64K:
 		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
 		break;
 	case VM_MODE_P48V48_4K:
 	case VM_MODE_P40V48_4K:
+	case VM_MODE_P36V48_4K:
 		tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
 		break;
 	default:
@@ -272,6 +274,10 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 	case VM_MODE_P40V48_64K:
 		tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
 		break;
+	case VM_MODE_P36V48_4K:
+	case VM_MODE_P36V48_64K:
+		tcr_el1 |= 1ul << 32; /* IPS = 36 bits */
+		break;
 	default:
 		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
 	}
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index 67144fdac433..240f2d2e2d23 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -37,6 +37,10 @@ void guest_modes_append_default(void)
 			if (ps4k)
 				vm_mode_default = VM_MODE_P40V48_4K;
 		}
+		if (limit >= 36) {
+			guest_mode_append(VM_MODE_P36V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P36V48_64K, ps64k, ps64k);
+		}
 
 		/*
 		 * Pick the first supported IPA size if the default
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8f2e0bb1ef96..643cb2e9a653 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -172,6 +172,8 @@ const char *vm_guest_mode_string(uint32_t i)
 		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
 		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
 		[VM_MODE_P44V64_4K]	= "PA-bits:44,  VA-bits:64,  4K pages",
+		[VM_MODE_P36V48_4K]	= "PA-bits:36,  VA-bits:48,  4K pages",
+		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
 	};
 	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
 		       "Missing new mode strings?");
@@ -191,6 +193,8 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
 	[VM_MODE_PXXV48_4K]	= {  0,  0,  0x1000, 12 },
 	[VM_MODE_P47V64_4K]	= { 47, 64,  0x1000, 12 },
 	[VM_MODE_P44V64_4K]	= { 44, 64,  0x1000, 12 },
+	[VM_MODE_P36V48_4K]	= { 36, 48,  0x1000, 12 },
+	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
 };
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
 	       "Missing new mode params?");
@@ -252,9 +256,11 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 		vm->pgtable_levels = 3;
 		break;
 	case VM_MODE_P40V48_4K:
+	case VM_MODE_P36V48_4K:
 		vm->pgtable_levels = 4;
 		break;
 	case VM_MODE_P40V48_64K:
+	case VM_MODE_P36V48_64K:
 		vm->pgtable_levels = 3;
 		break;
 	case VM_MODE_PXXV48_4K:
-- 
cgit v1.2.3


From aa674de1dc3d2bdf2c67ad195dc81977972323c6 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Dec 2021 12:48:09 +0000
Subject: KVM: selftests: arm64: Add support for various modes with 16kB page
 size

The 16kB page size is not a popular choice, due to only a few CPUs
actually implementing support for it. However, it can lead to some
interesting performance improvements given the right uarch choices.

Add support for this page size for various PA/VA combinations.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211227124809.1335409-7-maz@kernel.org
---
 tools/testing/selftests/kvm/include/kvm_util.h      |  4 ++++
 tools/testing/selftests/kvm/lib/aarch64/processor.c | 10 ++++++++++
 tools/testing/selftests/kvm/lib/guest_modes.c       |  4 ++++
 tools/testing/selftests/kvm/lib/kvm_util.c          | 16 ++++++++++++++++
 4 files changed, 34 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 4e9e01631bc3..4fee66da597c 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -42,14 +42,18 @@ enum vm_guest_mode {
 	VM_MODE_P52V48_4K,
 	VM_MODE_P52V48_64K,
 	VM_MODE_P48V48_4K,
+	VM_MODE_P48V48_16K,
 	VM_MODE_P48V48_64K,
 	VM_MODE_P40V48_4K,
+	VM_MODE_P40V48_16K,
 	VM_MODE_P40V48_64K,
 	VM_MODE_PXXV48_4K,	/* For 48bits VA but ANY bits PA */
 	VM_MODE_P47V64_4K,
 	VM_MODE_P44V64_4K,
 	VM_MODE_P36V48_4K,
+	VM_MODE_P36V48_16K,
 	VM_MODE_P36V48_64K,
+	VM_MODE_P36V47_16K,
 	NUM_VM_MODES,
 };
 
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 70395c777ea4..9343d82519b4 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -252,6 +252,12 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 	case VM_MODE_P36V48_64K:
 		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
 		break;
+	case VM_MODE_P48V48_16K:
+	case VM_MODE_P40V48_16K:
+	case VM_MODE_P36V48_16K:
+	case VM_MODE_P36V47_16K:
+		tcr_el1 |= 2ul << 14; /* TG0 = 16KB */
+		break;
 	case VM_MODE_P48V48_4K:
 	case VM_MODE_P40V48_4K:
 	case VM_MODE_P36V48_4K:
@@ -267,15 +273,19 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_init
 		tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
 		break;
 	case VM_MODE_P48V48_4K:
+	case VM_MODE_P48V48_16K:
 	case VM_MODE_P48V48_64K:
 		tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
 		break;
 	case VM_MODE_P40V48_4K:
+	case VM_MODE_P40V48_16K:
 	case VM_MODE_P40V48_64K:
 		tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
 		break;
 	case VM_MODE_P36V48_4K:
+	case VM_MODE_P36V48_16K:
 	case VM_MODE_P36V48_64K:
+	case VM_MODE_P36V47_16K:
 		tcr_el1 |= 1ul << 32; /* IPS = 36 bits */
 		break;
 	default:
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index 240f2d2e2d23..da315d3373a3 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -29,17 +29,21 @@ void guest_modes_append_default(void)
 			guest_mode_append(VM_MODE_P52V48_64K, ps64k, ps64k);
 		if (limit >= 48) {
 			guest_mode_append(VM_MODE_P48V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P48V48_16K, ps16k, ps16k);
 			guest_mode_append(VM_MODE_P48V48_64K, ps64k, ps64k);
 		}
 		if (limit >= 40) {
 			guest_mode_append(VM_MODE_P40V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P40V48_16K, ps16k, ps16k);
 			guest_mode_append(VM_MODE_P40V48_64K, ps64k, ps64k);
 			if (ps4k)
 				vm_mode_default = VM_MODE_P40V48_4K;
 		}
 		if (limit >= 36) {
 			guest_mode_append(VM_MODE_P36V48_4K, ps4k, ps4k);
+			guest_mode_append(VM_MODE_P36V48_16K, ps16k, ps16k);
 			guest_mode_append(VM_MODE_P36V48_64K, ps64k, ps64k);
+			guest_mode_append(VM_MODE_P36V47_16K, ps16k, ps16k);
 		}
 
 		/*
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 643cb2e9a653..6c8766c9c166 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -166,14 +166,18 @@ const char *vm_guest_mode_string(uint32_t i)
 		[VM_MODE_P52V48_4K]	= "PA-bits:52,  VA-bits:48,  4K pages",
 		[VM_MODE_P52V48_64K]	= "PA-bits:52,  VA-bits:48, 64K pages",
 		[VM_MODE_P48V48_4K]	= "PA-bits:48,  VA-bits:48,  4K pages",
+		[VM_MODE_P48V48_16K]	= "PA-bits:48,  VA-bits:48, 16K pages",
 		[VM_MODE_P48V48_64K]	= "PA-bits:48,  VA-bits:48, 64K pages",
 		[VM_MODE_P40V48_4K]	= "PA-bits:40,  VA-bits:48,  4K pages",
+		[VM_MODE_P40V48_16K]	= "PA-bits:40,  VA-bits:48, 16K pages",
 		[VM_MODE_P40V48_64K]	= "PA-bits:40,  VA-bits:48, 64K pages",
 		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
 		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
 		[VM_MODE_P44V64_4K]	= "PA-bits:44,  VA-bits:64,  4K pages",
 		[VM_MODE_P36V48_4K]	= "PA-bits:36,  VA-bits:48,  4K pages",
+		[VM_MODE_P36V48_16K]	= "PA-bits:36,  VA-bits:48, 16K pages",
 		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
+		[VM_MODE_P36V47_16K]	= "PA-bits:36,  VA-bits:47, 16K pages",
 	};
 	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
 		       "Missing new mode strings?");
@@ -187,14 +191,18 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
 	[VM_MODE_P52V48_4K]	= { 52, 48,  0x1000, 12 },
 	[VM_MODE_P52V48_64K]	= { 52, 48, 0x10000, 16 },
 	[VM_MODE_P48V48_4K]	= { 48, 48,  0x1000, 12 },
+	[VM_MODE_P48V48_16K]	= { 48, 48,  0x4000, 14 },
 	[VM_MODE_P48V48_64K]	= { 48, 48, 0x10000, 16 },
 	[VM_MODE_P40V48_4K]	= { 40, 48,  0x1000, 12 },
+	[VM_MODE_P40V48_16K]	= { 40, 48,  0x4000, 14 },
 	[VM_MODE_P40V48_64K]	= { 40, 48, 0x10000, 16 },
 	[VM_MODE_PXXV48_4K]	= {  0,  0,  0x1000, 12 },
 	[VM_MODE_P47V64_4K]	= { 47, 64,  0x1000, 12 },
 	[VM_MODE_P44V64_4K]	= { 44, 64,  0x1000, 12 },
 	[VM_MODE_P36V48_4K]	= { 36, 48,  0x1000, 12 },
+	[VM_MODE_P36V48_16K]	= { 36, 48,  0x4000, 14 },
 	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
+	[VM_MODE_P36V47_16K]	= { 36, 47,  0x4000, 14 },
 };
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
 	       "Missing new mode params?");
@@ -263,6 +271,14 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 	case VM_MODE_P36V48_64K:
 		vm->pgtable_levels = 3;
 		break;
+	case VM_MODE_P48V48_16K:
+	case VM_MODE_P40V48_16K:
+	case VM_MODE_P36V48_16K:
+		vm->pgtable_levels = 4;
+		break;
+	case VM_MODE_P36V47_16K:
+		vm->pgtable_levels = 3;
+		break;
 	case VM_MODE_PXXV48_4K:
 #ifdef __x86_64__
 		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
-- 
cgit v1.2.3


From 33a1ca736e74839d08948973d30f6def820b8b14 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:50 -0800
Subject: KVM: selftests: aarch64: Move gic_v3.h to shared headers

Move gic_v3.h to the shared headers location. There are some definitions
that will be used in the vgic-irq test.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-2-ricarkol@google.com
---
 .../testing/selftests/kvm/include/aarch64/gic_v3.h | 70 ++++++++++++++++++++++
 tools/testing/selftests/kvm/lib/aarch64/gic_v3.h   | 70 ----------------------
 2 files changed, 70 insertions(+), 70 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/aarch64/gic_v3.h
 delete mode 100644 tools/testing/selftests/kvm/lib/aarch64/gic_v3.h

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h
new file mode 100644
index 000000000000..b51536d469a6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ARM Generic Interrupt Controller (GIC) v3 specific defines
+ */
+
+#ifndef SELFTEST_KVM_GICV3_H
+#define SELFTEST_KVM_GICV3_H
+
+#include <asm/sysreg.h>
+
+/*
+ * Distributor registers
+ */
+#define GICD_CTLR			0x0000
+#define GICD_TYPER			0x0004
+#define GICD_IGROUPR			0x0080
+#define GICD_ISENABLER			0x0100
+#define GICD_ICENABLER			0x0180
+#define GICD_ICACTIVER			0x0380
+#define GICD_IPRIORITYR			0x0400
+
+/*
+ * The assumption is that the guest runs in a non-secure mode.
+ * The following bits of GICD_CTLR are defined accordingly.
+ */
+#define GICD_CTLR_RWP			(1U << 31)
+#define GICD_CTLR_nASSGIreq		(1U << 8)
+#define GICD_CTLR_ARE_NS		(1U << 4)
+#define GICD_CTLR_ENABLE_G1A		(1U << 1)
+#define GICD_CTLR_ENABLE_G1		(1U << 0)
+
+#define GICD_TYPER_SPIS(typer)		((((typer) & 0x1f) + 1) * 32)
+#define GICD_INT_DEF_PRI_X4		0xa0a0a0a0
+
+/*
+ * Redistributor registers
+ */
+#define GICR_CTLR			0x000
+#define GICR_WAKER			0x014
+
+#define GICR_CTLR_RWP			(1U << 3)
+
+#define GICR_WAKER_ProcessorSleep	(1U << 1)
+#define GICR_WAKER_ChildrenAsleep	(1U << 2)
+
+/*
+ * Redistributor registers, offsets from SGI base
+ */
+#define GICR_IGROUPR0			GICD_IGROUPR
+#define GICR_ISENABLER0			GICD_ISENABLER
+#define GICR_ICENABLER0			GICD_ICENABLER
+#define GICR_ICACTIVER0			GICD_ICACTIVER
+#define GICR_IPRIORITYR0		GICD_IPRIORITYR
+
+/* CPU interface registers */
+#define SYS_ICC_PMR_EL1			sys_reg(3, 0, 4, 6, 0)
+#define SYS_ICC_IAR1_EL1		sys_reg(3, 0, 12, 12, 0)
+#define SYS_ICC_EOIR1_EL1		sys_reg(3, 0, 12, 12, 1)
+#define SYS_ICC_SRE_EL1			sys_reg(3, 0, 12, 12, 5)
+#define SYS_ICC_GRPEN1_EL1		sys_reg(3, 0, 12, 12, 7)
+
+#define ICC_PMR_DEF_PRIO		0xf0
+
+#define ICC_SRE_EL1_SRE			(1U << 0)
+
+#define ICC_IGRPEN1_EL1_ENABLE		(1U << 0)
+
+#define GICV3_MAX_CPUS			512
+
+#endif /* SELFTEST_KVM_GICV3_H */
diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.h b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.h
deleted file mode 100644
index b51536d469a6..000000000000
--- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * ARM Generic Interrupt Controller (GIC) v3 specific defines
- */
-
-#ifndef SELFTEST_KVM_GICV3_H
-#define SELFTEST_KVM_GICV3_H
-
-#include <asm/sysreg.h>
-
-/*
- * Distributor registers
- */
-#define GICD_CTLR			0x0000
-#define GICD_TYPER			0x0004
-#define GICD_IGROUPR			0x0080
-#define GICD_ISENABLER			0x0100
-#define GICD_ICENABLER			0x0180
-#define GICD_ICACTIVER			0x0380
-#define GICD_IPRIORITYR			0x0400
-
-/*
- * The assumption is that the guest runs in a non-secure mode.
- * The following bits of GICD_CTLR are defined accordingly.
- */
-#define GICD_CTLR_RWP			(1U << 31)
-#define GICD_CTLR_nASSGIreq		(1U << 8)
-#define GICD_CTLR_ARE_NS		(1U << 4)
-#define GICD_CTLR_ENABLE_G1A		(1U << 1)
-#define GICD_CTLR_ENABLE_G1		(1U << 0)
-
-#define GICD_TYPER_SPIS(typer)		((((typer) & 0x1f) + 1) * 32)
-#define GICD_INT_DEF_PRI_X4		0xa0a0a0a0
-
-/*
- * Redistributor registers
- */
-#define GICR_CTLR			0x000
-#define GICR_WAKER			0x014
-
-#define GICR_CTLR_RWP			(1U << 3)
-
-#define GICR_WAKER_ProcessorSleep	(1U << 1)
-#define GICR_WAKER_ChildrenAsleep	(1U << 2)
-
-/*
- * Redistributor registers, offsets from SGI base
- */
-#define GICR_IGROUPR0			GICD_IGROUPR
-#define GICR_ISENABLER0			GICD_ISENABLER
-#define GICR_ICENABLER0			GICD_ICENABLER
-#define GICR_ICACTIVER0			GICD_ICACTIVER
-#define GICR_IPRIORITYR0		GICD_IPRIORITYR
-
-/* CPU interface registers */
-#define SYS_ICC_PMR_EL1			sys_reg(3, 0, 4, 6, 0)
-#define SYS_ICC_IAR1_EL1		sys_reg(3, 0, 12, 12, 0)
-#define SYS_ICC_EOIR1_EL1		sys_reg(3, 0, 12, 12, 1)
-#define SYS_ICC_SRE_EL1			sys_reg(3, 0, 12, 12, 5)
-#define SYS_ICC_GRPEN1_EL1		sys_reg(3, 0, 12, 12, 7)
-
-#define ICC_PMR_DEF_PRIO		0xf0
-
-#define ICC_SRE_EL1_SRE			(1U << 0)
-
-#define ICC_IGRPEN1_EL1_ENABLE		(1U << 0)
-
-#define GICV3_MAX_CPUS			512
-
-#endif /* SELFTEST_KVM_GICV3_H */
-- 
cgit v1.2.3


From 745068367ccbf33d69cf4acf7b1a3d5478978e8e Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:51 -0800
Subject: KVM: selftests: aarch64: Add function for accessing GICv3 dist and
 redist registers

Add a generic library function for reading and writing GICv3 distributor
and redistributor registers. Then adapt some functions to use it; more
will come and use it in the next commit.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-3-ricarkol@google.com
---
 tools/testing/selftests/kvm/lib/aarch64/gic_v3.c | 124 ++++++++++++++++++-----
 1 file changed, 101 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
index 2dbf3339b62e..00e944fd8148 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
@@ -19,7 +19,8 @@ struct gicv3_data {
 	unsigned int nr_spis;
 };
 
-#define sgi_base_from_redist(redist_base) (redist_base + SZ_64K)
+#define sgi_base_from_redist(redist_base) 	(redist_base + SZ_64K)
+#define DIST_BIT				(1U << 31)
 
 enum gicv3_intid_range {
 	SGI_RANGE,
@@ -50,6 +51,14 @@ static void gicv3_gicr_wait_for_rwp(void *redist_base)
 	}
 }
 
+static void gicv3_wait_for_rwp(uint32_t cpu_or_dist)
+{
+	if (cpu_or_dist & DIST_BIT)
+		gicv3_gicd_wait_for_rwp();
+	else
+		gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu_or_dist]);
+}
+
 static enum gicv3_intid_range get_intid_range(unsigned int intid)
 {
 	switch (intid) {
@@ -81,39 +90,108 @@ static void gicv3_write_eoir(uint32_t irq)
 	isb();
 }
 
-static void
-gicv3_config_irq(unsigned int intid, unsigned int offset)
+uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset)
+{
+	void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base
+		: sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]);
+	return readl(base + offset);
+}
+
+void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val)
+{
+	void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base
+		: sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]);
+	writel(reg_val, base + offset);
+}
+
+uint32_t gicv3_getl_fields(uint32_t cpu_or_dist, uint64_t offset, uint32_t mask)
+{
+	return gicv3_reg_readl(cpu_or_dist, offset) & mask;
+}
+
+void gicv3_setl_fields(uint32_t cpu_or_dist, uint64_t offset,
+		uint32_t mask, uint32_t reg_val)
+{
+	uint32_t tmp = gicv3_reg_readl(cpu_or_dist, offset) & ~mask;
+
+	tmp |= (reg_val & mask);
+	gicv3_reg_writel(cpu_or_dist, offset, tmp);
+}
+
+/*
+ * We use a single offset for the distributor and redistributor maps as they
+ * have the same value in both. The only exceptions are registers that only
+ * exist in one and not the other, like GICR_WAKER that doesn't exist in the
+ * distributor map. Such registers are conveniently marked as reserved in the
+ * map that doesn't implement it; like GICR_WAKER's offset of 0x0014 being
+ * marked as "Reserved" in the Distributor map.
+ */
+static void gicv3_access_reg(uint32_t intid, uint64_t offset,
+		uint32_t reg_bits, uint32_t bits_per_field,
+		bool write, uint32_t *val)
 {
 	uint32_t cpu = guest_get_vcpuid();
-	uint32_t mask = 1 << (intid % 32);
 	enum gicv3_intid_range intid_range = get_intid_range(intid);
-	void *reg;
-
-	/* We care about 'cpu' only for SGIs or PPIs */
-	if (intid_range == SGI_RANGE || intid_range == PPI_RANGE) {
-		GUEST_ASSERT(cpu < gicv3_data.nr_cpus);
-
-		reg = sgi_base_from_redist(gicv3_data.redist_base[cpu]) +
-			offset;
-		writel(mask, reg);
-		gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu]);
-	} else if (intid_range == SPI_RANGE) {
-		reg = gicv3_data.dist_base + offset + (intid / 32) * 4;
-		writel(mask, reg);
-		gicv3_gicd_wait_for_rwp();
-	} else {
-		GUEST_ASSERT(0);
-	}
+	uint32_t fields_per_reg, index, mask, shift;
+	uint32_t cpu_or_dist;
+
+	GUEST_ASSERT(bits_per_field <= reg_bits);
+	GUEST_ASSERT(*val < (1U << bits_per_field));
+	/* Some registers like IROUTER are 64 bit long. Those are currently not
+	 * supported by readl nor writel, so just asserting here until then.
+	 */
+	GUEST_ASSERT(reg_bits == 32);
+
+	fields_per_reg = reg_bits / bits_per_field;
+	index = intid % fields_per_reg;
+	shift = index * bits_per_field;
+	mask = ((1U << bits_per_field) - 1) << shift;
+
+	/* Set offset to the actual register holding intid's config. */
+	offset += (intid / fields_per_reg) * (reg_bits / 8);
+
+	cpu_or_dist = (intid_range == SPI_RANGE) ? DIST_BIT : cpu;
+
+	if (write)
+		gicv3_setl_fields(cpu_or_dist, offset, mask, *val << shift);
+	*val = gicv3_getl_fields(cpu_or_dist, offset, mask) >> shift;
+}
+
+static void gicv3_write_reg(uint32_t intid, uint64_t offset,
+		uint32_t reg_bits, uint32_t bits_per_field, uint32_t val)
+{
+	gicv3_access_reg(intid, offset, reg_bits,
+			bits_per_field, true, &val);
+}
+
+static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset,
+		uint32_t reg_bits, uint32_t bits_per_field)
+{
+	uint32_t val;
+
+	gicv3_access_reg(intid, offset, reg_bits,
+			bits_per_field, false, &val);
+	return val;
 }
 
 static void gicv3_irq_enable(unsigned int intid)
 {
-	gicv3_config_irq(intid, GICD_ISENABLER);
+	bool is_spi = get_intid_range(intid) == SPI_RANGE;
+	unsigned int val = 1;
+	uint32_t cpu = guest_get_vcpuid();
+
+	gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, val);
+	gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu);
 }
 
 static void gicv3_irq_disable(unsigned int intid)
 {
-	gicv3_config_irq(intid, GICD_ICENABLER);
+	bool is_spi = get_intid_range(intid) == SPI_RANGE;
+	uint32_t val = 1;
+	uint32_t cpu = guest_get_vcpuid();
+
+	gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, val);
+	gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu);
 }
 
 static void gicv3_enable_redist(void *redist_base)
-- 
cgit v1.2.3


From 17ce617bf76a7c1d3b553ed01607706434b9ed59 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:52 -0800
Subject: KVM: selftests: aarch64: Add GICv3 register accessor library
 functions

Add library functions for accessing GICv3 registers: DIR, PMR, CTLR,
ISACTIVER, ISPENDR.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-4-ricarkol@google.com
---
 tools/testing/selftests/kvm/include/aarch64/gic.h  | 16 ++++
 .../testing/selftests/kvm/include/aarch64/gic_v3.h | 12 +++
 tools/testing/selftests/kvm/lib/aarch64/gic.c      | 66 ++++++++++++++++
 .../selftests/kvm/lib/aarch64/gic_private.h        | 11 +++
 tools/testing/selftests/kvm/lib/aarch64/gic_v3.c   | 90 ++++++++++++++++++++--
 5 files changed, 189 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h
index 85dd1e53048e..c932cf3d0771 100644
--- a/tools/testing/selftests/kvm/include/aarch64/gic.h
+++ b/tools/testing/selftests/kvm/include/aarch64/gic.h
@@ -17,5 +17,21 @@ void gic_irq_enable(unsigned int intid);
 void gic_irq_disable(unsigned int intid);
 unsigned int gic_get_and_ack_irq(void);
 void gic_set_eoi(unsigned int intid);
+void gic_set_dir(unsigned int intid);
+
+/*
+ * Sets the EOI mode. When split is false, EOI just drops the priority. When
+ * split is true, EOI drops the priority and deactivates the interrupt.
+ */
+void gic_set_eoi_split(bool split);
+void gic_set_priority_mask(uint64_t mask);
+void gic_set_priority(uint32_t intid, uint32_t prio);
+void gic_irq_set_active(unsigned int intid);
+void gic_irq_clear_active(unsigned int intid);
+bool gic_irq_get_active(unsigned int intid);
+void gic_irq_set_pending(unsigned int intid);
+void gic_irq_clear_pending(unsigned int intid);
+bool gic_irq_get_pending(unsigned int intid);
+void gic_irq_set_config(unsigned int intid, bool is_edge);
 
 #endif /* SELFTEST_KVM_GIC_H */
diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h
index b51536d469a6..ba0886e8a2bb 100644
--- a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h
+++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h
@@ -16,8 +16,12 @@
 #define GICD_IGROUPR			0x0080
 #define GICD_ISENABLER			0x0100
 #define GICD_ICENABLER			0x0180
+#define GICD_ISPENDR			0x0200
+#define GICD_ICPENDR			0x0280
 #define GICD_ICACTIVER			0x0380
+#define GICD_ISACTIVER			0x0300
 #define GICD_IPRIORITYR			0x0400
+#define GICD_ICFGR			0x0C00
 
 /*
  * The assumption is that the guest runs in a non-secure mode.
@@ -49,16 +53,24 @@
 #define GICR_IGROUPR0			GICD_IGROUPR
 #define GICR_ISENABLER0			GICD_ISENABLER
 #define GICR_ICENABLER0			GICD_ICENABLER
+#define GICR_ISPENDR0			GICD_ISPENDR
+#define GICR_ISACTIVER0			GICD_ISACTIVER
 #define GICR_ICACTIVER0			GICD_ICACTIVER
+#define GICR_ICENABLER			GICD_ICENABLER
+#define GICR_ICACTIVER			GICD_ICACTIVER
 #define GICR_IPRIORITYR0		GICD_IPRIORITYR
 
 /* CPU interface registers */
 #define SYS_ICC_PMR_EL1			sys_reg(3, 0, 4, 6, 0)
 #define SYS_ICC_IAR1_EL1		sys_reg(3, 0, 12, 12, 0)
 #define SYS_ICC_EOIR1_EL1		sys_reg(3, 0, 12, 12, 1)
+#define SYS_ICC_DIR_EL1			sys_reg(3, 0, 12, 11, 1)
+#define SYS_ICC_CTLR_EL1		sys_reg(3, 0, 12, 12, 4)
 #define SYS_ICC_SRE_EL1			sys_reg(3, 0, 12, 12, 5)
 #define SYS_ICC_GRPEN1_EL1		sys_reg(3, 0, 12, 12, 7)
 
+#define SYS_ICV_AP1R0_EL1		sys_reg(3, 0, 12, 9, 0)
+
 #define ICC_PMR_DEF_PRIO		0xf0
 
 #define ICC_SRE_EL1_SRE			(1U << 0)
diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c
index fff4fc27504d..55668631d546 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/gic.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c
@@ -93,3 +93,69 @@ void gic_set_eoi(unsigned int intid)
 	GUEST_ASSERT(gic_common_ops);
 	gic_common_ops->gic_write_eoir(intid);
 }
+
+void gic_set_dir(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_write_dir(intid);
+}
+
+void gic_set_eoi_split(bool split)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_set_eoi_split(split);
+}
+
+void gic_set_priority_mask(uint64_t pmr)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_set_priority_mask(pmr);
+}
+
+void gic_set_priority(unsigned int intid, unsigned int prio)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_set_priority(intid, prio);
+}
+
+void gic_irq_set_active(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_irq_set_active(intid);
+}
+
+void gic_irq_clear_active(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_irq_clear_active(intid);
+}
+
+bool gic_irq_get_active(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	return gic_common_ops->gic_irq_get_active(intid);
+}
+
+void gic_irq_set_pending(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_irq_set_pending(intid);
+}
+
+void gic_irq_clear_pending(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_irq_clear_pending(intid);
+}
+
+bool gic_irq_get_pending(unsigned int intid)
+{
+	GUEST_ASSERT(gic_common_ops);
+	return gic_common_ops->gic_irq_get_pending(intid);
+}
+
+void gic_irq_set_config(unsigned int intid, bool is_edge)
+{
+	GUEST_ASSERT(gic_common_ops);
+	gic_common_ops->gic_irq_set_config(intid, is_edge);
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h
index d81d739433dc..75d07313c893 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h
+++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h
@@ -14,6 +14,17 @@ struct gic_common_ops {
 	void (*gic_irq_disable)(unsigned int intid);
 	uint64_t (*gic_read_iar)(void);
 	void (*gic_write_eoir)(uint32_t irq);
+	void (*gic_write_dir)(uint32_t irq);
+	void (*gic_set_eoi_split)(bool split);
+	void (*gic_set_priority_mask)(uint64_t mask);
+	void (*gic_set_priority)(uint32_t intid, uint32_t prio);
+	void (*gic_irq_set_active)(uint32_t intid);
+	void (*gic_irq_clear_active)(uint32_t intid);
+	bool (*gic_irq_get_active)(uint32_t intid);
+	void (*gic_irq_set_pending)(uint32_t intid);
+	void (*gic_irq_clear_pending)(uint32_t intid);
+	bool (*gic_irq_get_pending)(uint32_t intid);
+	void (*gic_irq_set_config)(uint32_t intid, bool is_edge);
 };
 
 extern const struct gic_common_ops gicv3_ops;
diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
index 00e944fd8148..00f613c0583c 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c
@@ -90,6 +90,29 @@ static void gicv3_write_eoir(uint32_t irq)
 	isb();
 }
 
+static void gicv3_write_dir(uint32_t irq)
+{
+	write_sysreg_s(irq, SYS_ICC_DIR_EL1);
+	isb();
+}
+
+static void gicv3_set_priority_mask(uint64_t mask)
+{
+	write_sysreg_s(mask, SYS_ICC_PMR_EL1);
+}
+
+static void gicv3_set_eoi_split(bool split)
+{
+	uint32_t val;
+
+	/* All other fields are read-only, so no need to read CTLR first. In
+	 * fact, the kernel does the same.
+	 */
+	val = split ? (1U << 1) : 0;
+	write_sysreg_s(val, SYS_ICC_CTLR_EL1);
+	isb();
+}
+
 uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset)
 {
 	void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base
@@ -174,26 +197,70 @@ static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset,
 	return val;
 }
 
-static void gicv3_irq_enable(unsigned int intid)
+static void gicv3_set_priority(uint32_t intid, uint32_t prio)
+{
+	gicv3_write_reg(intid, GICD_IPRIORITYR, 32, 8, prio);
+}
+
+/* Sets the intid to be level-sensitive or edge-triggered. */
+static void gicv3_irq_set_config(uint32_t intid, bool is_edge)
+{
+	uint32_t val;
+
+	/* N/A for private interrupts. */
+	GUEST_ASSERT(get_intid_range(intid) == SPI_RANGE);
+	val = is_edge ? 2 : 0;
+	gicv3_write_reg(intid, GICD_ICFGR, 32, 2, val);
+}
+
+static void gicv3_irq_enable(uint32_t intid)
 {
 	bool is_spi = get_intid_range(intid) == SPI_RANGE;
-	unsigned int val = 1;
 	uint32_t cpu = guest_get_vcpuid();
 
-	gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, val);
+	gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, 1);
 	gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu);
 }
 
-static void gicv3_irq_disable(unsigned int intid)
+static void gicv3_irq_disable(uint32_t intid)
 {
 	bool is_spi = get_intid_range(intid) == SPI_RANGE;
-	uint32_t val = 1;
 	uint32_t cpu = guest_get_vcpuid();
 
-	gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, val);
+	gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, 1);
 	gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu);
 }
 
+static void gicv3_irq_set_active(uint32_t intid)
+{
+	gicv3_write_reg(intid, GICD_ISACTIVER, 32, 1, 1);
+}
+
+static void gicv3_irq_clear_active(uint32_t intid)
+{
+	gicv3_write_reg(intid, GICD_ICACTIVER, 32, 1, 1);
+}
+
+static bool gicv3_irq_get_active(uint32_t intid)
+{
+	return gicv3_read_reg(intid, GICD_ISACTIVER, 32, 1);
+}
+
+static void gicv3_irq_set_pending(uint32_t intid)
+{
+	gicv3_write_reg(intid, GICD_ISPENDR, 32, 1, 1);
+}
+
+static void gicv3_irq_clear_pending(uint32_t intid)
+{
+	gicv3_write_reg(intid, GICD_ICPENDR, 32, 1, 1);
+}
+
+static bool gicv3_irq_get_pending(uint32_t intid)
+{
+	return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1);
+}
+
 static void gicv3_enable_redist(void *redist_base)
 {
 	uint32_t val = readl(redist_base + GICR_WAKER);
@@ -315,4 +382,15 @@ const struct gic_common_ops gicv3_ops = {
 	.gic_irq_disable = gicv3_irq_disable,
 	.gic_read_iar = gicv3_read_iar,
 	.gic_write_eoir = gicv3_write_eoir,
+	.gic_write_dir = gicv3_write_dir,
+	.gic_set_priority_mask = gicv3_set_priority_mask,
+	.gic_set_eoi_split = gicv3_set_eoi_split,
+	.gic_set_priority = gicv3_set_priority,
+	.gic_irq_set_active = gicv3_irq_set_active,
+	.gic_irq_clear_active = gicv3_irq_clear_active,
+	.gic_irq_get_active = gicv3_irq_get_active,
+	.gic_irq_set_pending = gicv3_irq_set_pending,
+	.gic_irq_clear_pending = gicv3_irq_clear_pending,
+	.gic_irq_get_pending = gicv3_irq_get_pending,
+	.gic_irq_set_config = gicv3_irq_set_config,
 };
-- 
cgit v1.2.3


From 227895ed6d03b46fa619614a41a3b8e1074d6151 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:53 -0800
Subject: KVM: selftests: Add kvm_irq_line library function

Add an architecture independent wrapper function for the KVM_IRQ_LINE
ioctl.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-5-ricarkol@google.com
---
 tools/testing/selftests/kvm/include/kvm_util.h |  2 ++
 tools/testing/selftests/kvm/lib/kvm_util.c     | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 6a1a37f30494..4ba8fd44cf95 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -240,6 +240,8 @@ int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
 		       void *val, bool write);
 int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
 		      void *val, bool write);
+void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
+int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
 
 int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
 			  uint64_t attr);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8f2e0bb1ef96..9f7210c2d75e 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2086,6 +2086,27 @@ int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
 	return ret;
 }
 
+/*
+ * IRQ related functions.
+ */
+
+int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
+{
+	struct kvm_irq_level irq_level = {
+		.irq    = irq,
+		.level  = level,
+	};
+
+	return _vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
+}
+
+void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
+{
+	int ret = _kvm_irq_line(vm, irq, level);
+
+	TEST_ASSERT(ret >= 0, "KVM_IRQ_LINE failed, rc: %i errno: %i", ret, errno);
+}
+
 /*
  * VM Dump
  *
-- 
cgit v1.2.3


From e95def3a904dea467309bbe382a9032d301ba9cd Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:54 -0800
Subject: KVM: selftests: aarch64: Add vGIC library functions to deal with vIRQ
 state

Add a set of library functions for userspace code in selftests to deal
with vIRQ state (i.e., ioctl wrappers).

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-6-ricarkol@google.com
---
 tools/testing/selftests/kvm/include/aarch64/gic.h  | 10 +++
 tools/testing/selftests/kvm/include/aarch64/vgic.h | 14 +++-
 tools/testing/selftests/kvm/lib/aarch64/vgic.c     | 93 ++++++++++++++++++++++
 3 files changed, 116 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h
index c932cf3d0771..b217ea17cac5 100644
--- a/tools/testing/selftests/kvm/include/aarch64/gic.h
+++ b/tools/testing/selftests/kvm/include/aarch64/gic.h
@@ -11,6 +11,16 @@ enum gic_type {
 	GIC_TYPE_MAX,
 };
 
+#define MIN_SGI			0
+#define MIN_PPI			16
+#define MIN_SPI			32
+#define MAX_SPI			1019
+#define IAR_SPURIOUS		1023
+
+#define INTID_IS_SGI(intid)	(0       <= (intid) && (intid) < MIN_PPI)
+#define INTID_IS_PPI(intid)	(MIN_PPI <= (intid) && (intid) < MIN_SPI)
+#define INTID_IS_SPI(intid)	(MIN_SPI <= (intid) && (intid) <= MAX_SPI)
+
 void gic_init(enum gic_type type, unsigned int nr_cpus,
 		void *dist_base, void *redist_base);
 void gic_irq_enable(unsigned int intid);
diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h
index 0ecfb253893c..ec8744bb2d4b 100644
--- a/tools/testing/selftests/kvm/include/aarch64/vgic.h
+++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h
@@ -17,4 +17,16 @@
 int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus,
 		uint64_t gicd_base_gpa, uint64_t gicr_base_gpa);
 
-#endif /* SELFTEST_KVM_VGIC_H */
+#define VGIC_MAX_RESERVED	1023
+
+void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level);
+int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level);
+
+void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level);
+int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level);
+
+/* The vcpu arg only applies to private interrupts. */
+void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu);
+void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, uint32_t vcpu);
+
+#endif // SELFTEST_KVM_VGIC_H
diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
index b9b271ff520d..a1f1f6c8e2e0 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
@@ -5,11 +5,14 @@
 
 #include <linux/kvm.h>
 #include <linux/sizes.h>
+#include <asm/kvm_para.h>
 #include <asm/kvm.h>
 
 #include "kvm_util.h"
 #include "../kvm_util_internal.h"
 #include "vgic.h"
+#include "gic.h"
+#include "gic_v3.h"
 
 /*
  * vGIC-v3 default host setup
@@ -68,3 +71,93 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus,
 
 	return gic_fd;
 }
+
+/* should only work for level sensitive interrupts */
+int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level)
+{
+	uint64_t attr = 32 * (intid / 32);
+	uint64_t index = intid % 32;
+	uint64_t val;
+	int ret;
+
+	ret = _kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO,
+				 attr, &val, false);
+	if (ret != 0)
+		return ret;
+
+	val |= 1U << index;
+	ret = _kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO,
+				 attr, &val, true);
+	return ret;
+}
+
+void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level)
+{
+	int ret = _kvm_irq_set_level_info(gic_fd, intid, level);
+
+	TEST_ASSERT(ret == 0, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO failed, "
+			"rc: %i errno: %i", ret, errno);
+}
+
+int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level)
+{
+	uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK;
+
+	if (INTID_IS_PPI(intid))
+		irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT;
+	else if (INTID_IS_SPI(intid))
+		irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT;
+	else
+		TEST_FAIL("KVM_IRQ_LINE can't be used with SGIs.");
+
+	return _kvm_irq_line(vm, irq, level);
+}
+
+void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level)
+{
+	int ret = _kvm_arm_irq_line(vm, intid, level);
+
+	TEST_ASSERT(ret == 0, "KVM_IRQ_LINE failed, rc: %i errno: %i",
+			ret, errno);
+}
+
+static void vgic_poke_irq(int gic_fd, uint32_t intid,
+		uint32_t vcpu, uint64_t reg_off)
+{
+	uint64_t reg = intid / 32;
+	uint64_t index = intid % 32;
+	uint64_t attr = reg_off + reg * 4;
+	uint64_t val;
+	bool intid_is_private = INTID_IS_SGI(intid) || INTID_IS_PPI(intid);
+
+	/* Check that the addr part of the attr is within 32 bits. */
+	assert(attr <= KVM_DEV_ARM_VGIC_OFFSET_MASK);
+
+	uint32_t group = intid_is_private ? KVM_DEV_ARM_VGIC_GRP_REDIST_REGS
+					  : KVM_DEV_ARM_VGIC_GRP_DIST_REGS;
+
+	if (intid_is_private) {
+		/* TODO: only vcpu 0 implemented for now. */
+		assert(vcpu == 0);
+		attr += SZ_64K;
+	}
+
+	/* All calls will succeed, even with invalid intid's, as long as the
+	 * addr part of the attr is within 32 bits (checked above). An invalid
+	 * intid will just make the read/writes point to above the intended
+	 * register space (i.e., ICPENDR after ISPENDR).
+	 */
+	kvm_device_access(gic_fd, group, attr, &val, false);
+	val |= 1ULL << index;
+	kvm_device_access(gic_fd, group, attr, &val, true);
+}
+
+void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu)
+{
+	vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR);
+}
+
+void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, uint32_t vcpu)
+{
+	vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER);
+}
-- 
cgit v1.2.3


From 50b020cdb7f72077e16133f1d88c9359cf415a53 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:55 -0800
Subject: KVM: selftests: aarch64: Add vgic_irq to test userspace IRQ injection

Add a new KVM selftest, vgic_irq, for testing userspace IRQ injection.  This
particular test injects an SPI using KVM_IRQ_LINE on GICv3 and verifies
that the IRQ is handled in the guest. The next commits will add more
types of IRQs and different modes.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-7-ricarkol@google.com
---
 tools/testing/selftests/kvm/.gitignore         |   1 +
 tools/testing/selftests/kvm/Makefile           |   1 +
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 244 +++++++++++++++++++++++++
 3 files changed, 246 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_irq.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 3763105029fb..f75d4c433fc3 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -4,6 +4,7 @@
 /aarch64/get-reg-list
 /aarch64/psci_cpu_on_test
 /aarch64/vgic_init
+/aarch64/vgic_irq
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c4e34717826a..758b5d0e352e 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -94,6 +94,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
new file mode 100644
index 000000000000..e13e87427038
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic_irq.c - Test userspace injection of IRQs
+ *
+ * This test validates the injection of IRQs from userspace using various
+ * methods (e.g., KVM_IRQ_LINE) and modes (e.g., EOI). The guest "asks" the
+ * host to inject a specific intid via a GUEST_SYNC call, and then checks that
+ * it received it.
+ */
+
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+#include <linux/sizes.h>
+
+#include "processor.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "gic.h"
+#include "gic_v3.h"
+#include "vgic.h"
+
+#define GICD_BASE_GPA		0x08000000ULL
+#define GICR_BASE_GPA		0x080A0000ULL
+#define VCPU_ID			0
+
+/*
+ * KVM implements 32 priority levels:
+ * 0x00 (highest priority) - 0xF8 (lowest priority), in steps of 8
+ *
+ * Note that these macros will still be correct in the case that KVM implements
+ * more priority levels. Also note that 32 is the minimum for GICv3 and GICv2.
+ */
+#define KVM_NUM_PRIOS		32
+#define KVM_PRIO_SHIFT		3 /* steps of 8 = 1 << 3 */
+#define LOWEST_PRIO		(KVM_NUM_PRIOS - 1)
+#define CPU_PRIO_MASK		(LOWEST_PRIO << KVM_PRIO_SHIFT)	/* 0xf8 */
+#define IRQ_DEFAULT_PRIO	(LOWEST_PRIO - 1)
+#define IRQ_DEFAULT_PRIO_REG	(IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */
+
+static void *dist = (void *)GICD_BASE_GPA;
+static void *redist = (void *)GICR_BASE_GPA;
+
+/*
+ * The kvm_inject_* utilities are used by the guest to ask the host to inject
+ * interrupts (e.g., using the KVM_IRQ_LINE ioctl).
+ */
+
+typedef enum {
+	KVM_INJECT_EDGE_IRQ_LINE = 1,
+} kvm_inject_cmd;
+
+struct kvm_inject_args {
+	kvm_inject_cmd cmd;
+	uint32_t intid;
+};
+
+/* Used on the guest side to perform the hypercall. */
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t intid);
+
+/* Used on the host side to get the hypercall info. */
+static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
+		struct kvm_inject_args *args);
+
+/* Shared between the guest main thread and the IRQ handlers. */
+volatile uint64_t irq_handled;
+volatile uint32_t irqnr_received[MAX_SPI + 1];
+
+static void reset_stats(void)
+{
+	int i;
+
+	irq_handled = 0;
+	for (i = 0; i <= MAX_SPI; i++)
+		irqnr_received[i] = 0;
+}
+
+static uint64_t gic_read_ap1r0(void)
+{
+	uint64_t reg = read_sysreg_s(SYS_ICV_AP1R0_EL1);
+
+	dsb(sy);
+	return reg;
+}
+
+static void guest_irq_handler(struct ex_regs *regs)
+{
+	uint32_t intid = gic_get_and_ack_irq();
+
+	if (intid == IAR_SPURIOUS)
+		return;
+
+	GUEST_ASSERT(gic_irq_get_active(intid));
+
+	GUEST_ASSERT(!gic_irq_get_pending(intid));
+
+	GUEST_ASSERT(intid < MAX_SPI);
+	irqnr_received[intid] += 1;
+	irq_handled += 1;
+
+	gic_set_eoi(intid);
+	GUEST_ASSERT_EQ(gic_read_ap1r0(), 0);
+
+	GUEST_ASSERT(!gic_irq_get_active(intid));
+	GUEST_ASSERT(!gic_irq_get_pending(intid));
+}
+
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t intid)
+{
+	struct kvm_inject_args args = {
+		.cmd = cmd,
+		.intid = intid,
+	};
+	GUEST_SYNC(&args);
+}
+
+#define GUEST_ASSERT_IAR_EMPTY()						\
+do { 										\
+	uint32_t _intid;							\
+	_intid = gic_get_and_ack_irq();						\
+	GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS);			\
+} while (0)
+
+static void test_kvm_irq_line(uint32_t intid)
+{
+	reset_stats();
+
+	asm volatile("msr daifset, #2" : : : "memory");
+	kvm_inject_call(KVM_INJECT_EDGE_IRQ_LINE, intid);
+
+	while (irq_handled < 1) {
+		asm volatile("wfi\n"
+			     "msr daifclr, #2\n"
+			     /* handle IRQ */
+			     "msr daifset, #2\n"
+			     : : : "memory");
+	}
+	asm volatile("msr daifclr, #2" : : : "memory");
+
+	GUEST_ASSERT_EQ(irq_handled, 1);
+	GUEST_ASSERT_EQ(irqnr_received[intid], 1);
+	GUEST_ASSERT_IAR_EMPTY();
+}
+
+static void guest_code(void)
+{
+	uint32_t i;
+	uint32_t nr_irqs = 64; /* absolute minimum number of IRQs supported. */
+
+	gic_init(GIC_V3, 1, dist, redist);
+
+	for (i = 0; i < nr_irqs; i++) {
+		gic_irq_enable(i);
+		gic_set_priority(i, IRQ_DEFAULT_PRIO_REG);
+	}
+
+	gic_set_priority_mask(CPU_PRIO_MASK);
+
+	local_irq_enable();
+
+	test_kvm_irq_line(MIN_SPI);
+
+	GUEST_DONE();
+}
+
+static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
+		struct kvm_inject_args *inject_args)
+{
+	kvm_inject_cmd cmd = inject_args->cmd;
+	uint32_t intid = inject_args->intid;
+
+	switch (cmd) {
+	case KVM_INJECT_EDGE_IRQ_LINE:
+		kvm_arm_irq_line(vm, intid, 1);
+		kvm_arm_irq_line(vm, intid, 0);
+		break;
+	default:
+		break;
+	}
+}
+
+static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
+		struct kvm_inject_args *args)
+{
+	struct kvm_inject_args *kvm_args_hva;
+	vm_vaddr_t kvm_args_gva;
+
+	kvm_args_gva = uc->args[1];
+	kvm_args_hva = (struct kvm_inject_args *)addr_gva2hva(vm, kvm_args_gva);
+	memcpy(args, kvm_args_hva, sizeof(struct kvm_inject_args));
+}
+
+
+static void test_vgic(void)
+{
+	struct ucall uc;
+	int gic_fd;
+	struct kvm_vm *vm;
+	struct kvm_inject_args inject_args;
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	ucall_init(vm, NULL);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vm, VCPU_ID);
+
+	gic_fd = vgic_v3_setup(vm, 1, GICD_BASE_GPA, GICR_BASE_GPA);
+
+	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT,
+			guest_irq_handler);
+
+	while (1) {
+		vcpu_run(vm, VCPU_ID);
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_SYNC:
+			kvm_inject_get_call(vm, &uc, &inject_args);
+			run_guest_cmd(vm, gic_fd, &inject_args);
+			break;
+		case UCALL_ABORT:
+			TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx",
+					(const char *)uc.args[0],
+					__FILE__, uc.args[1], uc.args[2], uc.args[3]);
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+done:
+	close(gic_fd);
+	kvm_vm_free(vm);
+}
+
+int main(int ac, char **av)
+{
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	test_vgic();
+
+	return 0;
+}
-- 
cgit v1.2.3


From e1cb399eed1eda29568b17bdb75d16cee1fc3da4 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:56 -0800
Subject: KVM: selftests: aarch64: Abstract the injection functions in vgic_irq

Build an abstraction around the injection functions, so the preparation
and checking around the actual injection can be shared between tests.
All functions are stored as pointers in arrays of kvm_inject_desc's
which include the pointer and what kind of interrupts they can inject.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-8-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 39 ++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index e13e87427038..f5d76fef22f0 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -57,10 +57,28 @@ struct kvm_inject_args {
 /* Used on the guest side to perform the hypercall. */
 static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t intid);
 
+#define KVM_INJECT(cmd, intid)							\
+	kvm_inject_call(cmd, intid)
+
 /* Used on the host side to get the hypercall info. */
 static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 		struct kvm_inject_args *args);
 
+struct kvm_inject_desc {
+	kvm_inject_cmd cmd;
+	/* can inject PPIs, PPIs, and/or SPIs. */
+	bool sgi, ppi, spi;
+};
+
+static struct kvm_inject_desc inject_edge_fns[] = {
+	/*                                      sgi    ppi    spi */
+	{ KVM_INJECT_EDGE_IRQ_LINE,		false, false, true },
+	{ 0, },
+};
+
+#define for_each_inject_fn(t, f)						\
+	for ((f) = (t); (f)->cmd; (f)++)
+
 /* Shared between the guest main thread and the IRQ handlers. */
 volatile uint64_t irq_handled;
 volatile uint32_t irqnr_received[MAX_SPI + 1];
@@ -120,12 +138,12 @@ do { 										\
 	GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS);			\
 } while (0)
 
-static void test_kvm_irq_line(uint32_t intid)
+static void guest_inject(uint32_t intid, kvm_inject_cmd cmd)
 {
 	reset_stats();
 
 	asm volatile("msr daifset, #2" : : : "memory");
-	kvm_inject_call(KVM_INJECT_EDGE_IRQ_LINE, intid);
+	KVM_INJECT(cmd, intid);
 
 	while (irq_handled < 1) {
 		asm volatile("wfi\n"
@@ -141,10 +159,23 @@ static void test_kvm_irq_line(uint32_t intid)
 	GUEST_ASSERT_IAR_EMPTY();
 }
 
+static void test_injection(struct kvm_inject_desc *f)
+{
+	if (f->sgi)
+		guest_inject(MIN_SGI, f->cmd);
+
+	if (f->ppi)
+		guest_inject(MIN_PPI, f->cmd);
+
+	if (f->spi)
+		guest_inject(MIN_SPI, f->cmd);
+}
+
 static void guest_code(void)
 {
 	uint32_t i;
 	uint32_t nr_irqs = 64; /* absolute minimum number of IRQs supported. */
+	struct kvm_inject_desc *f;
 
 	gic_init(GIC_V3, 1, dist, redist);
 
@@ -157,7 +188,9 @@ static void guest_code(void)
 
 	local_irq_enable();
 
-	test_kvm_irq_line(MIN_SPI);
+	/* Start the tests. */
+	for_each_inject_fn(inject_edge_fns, f)
+		test_injection(f);
 
 	GUEST_DONE();
 }
-- 
cgit v1.2.3


From e5410ee2806d74a749fa39ca6fdb73be2f88611f Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:57 -0800
Subject: KVM: selftests: aarch64: Cmdline arg to set number of IRQs in
 vgic_irq test

Add the ability to specify the number of vIRQs exposed by KVM (arg
defaults to 64). Then extend the KVM_IRQ_LINE test by injecting all
available SPIs at once (specified by the nr-irqs arg). As a bonus,
inject all SGIs at once as well.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-9-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/arch_timer.c   |   2 +-
 tools/testing/selftests/kvm/aarch64/vgic_irq.c     | 149 ++++++++++++++++-----
 tools/testing/selftests/kvm/include/aarch64/vgic.h |   2 +-
 tools/testing/selftests/kvm/lib/aarch64/vgic.c     |   9 +-
 4 files changed, 127 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
index bf6a45b0b8dc..9ad38bd360a4 100644
--- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
+++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
@@ -382,7 +382,7 @@ static struct kvm_vm *test_vm_create(void)
 
 	ucall_init(vm, NULL);
 	test_init_timer_irq(vm);
-	vgic_v3_setup(vm, nr_vcpus, GICD_BASE_GPA, GICR_BASE_GPA);
+	vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA);
 
 	/* Make all the test's cmdline args visible to the guest */
 	sync_global_to_guest(vm, test_args);
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index f5d76fef22f0..0b89a29dfe79 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -23,6 +23,14 @@
 #define GICR_BASE_GPA		0x080A0000ULL
 #define VCPU_ID			0
 
+/*
+ * Stores the user specified args; it's passed to the guest and to every test
+ * function.
+ */
+struct test_args {
+	uint32_t nr_irqs; /* number of KVM supported IRQs. */
+};
+
 /*
  * KVM implements 32 priority levels:
  * 0x00 (highest priority) - 0xF8 (lowest priority), in steps of 8
@@ -51,14 +59,18 @@ typedef enum {
 
 struct kvm_inject_args {
 	kvm_inject_cmd cmd;
-	uint32_t intid;
+	uint32_t first_intid;
+	uint32_t num;
 };
 
 /* Used on the guest side to perform the hypercall. */
-static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t intid);
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, uint32_t num);
 
 #define KVM_INJECT(cmd, intid)							\
-	kvm_inject_call(cmd, intid)
+	kvm_inject_call(cmd, intid, 1)
+
+#define KVM_INJECT_MULTI(cmd, intid, num)					\
+	kvm_inject_call(cmd, intid, num)
 
 /* Used on the host side to get the hypercall info. */
 static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
@@ -122,11 +134,12 @@ static void guest_irq_handler(struct ex_regs *regs)
 	GUEST_ASSERT(!gic_irq_get_pending(intid));
 }
 
-static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t intid)
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, uint32_t num)
 {
 	struct kvm_inject_args args = {
 		.cmd = cmd,
-		.intid = intid,
+		.first_intid = first_intid,
+		.num = num,
 	};
 	GUEST_SYNC(&args);
 }
@@ -138,14 +151,30 @@ do { 										\
 	GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS);			\
 } while (0)
 
-static void guest_inject(uint32_t intid, kvm_inject_cmd cmd)
+static void reset_priorities(struct test_args *args)
+{
+	int i;
+
+	for (i = 0; i < args->nr_irqs; i++)
+		gic_set_priority(i, IRQ_DEFAULT_PRIO_REG);
+}
+
+static void guest_inject(struct test_args *args,
+		uint32_t first_intid, uint32_t num,
+		kvm_inject_cmd cmd)
 {
+	uint32_t i;
+
 	reset_stats();
 
+	/* Cycle over all priorities to make things more interesting. */
+	for (i = first_intid; i < num + first_intid; i++)
+		gic_set_priority(i, (i % (KVM_NUM_PRIOS - 1)) << 3);
+
 	asm volatile("msr daifset, #2" : : : "memory");
-	KVM_INJECT(cmd, intid);
+	KVM_INJECT_MULTI(cmd, first_intid, num);
 
-	while (irq_handled < 1) {
+	while (irq_handled < num) {
 		asm volatile("wfi\n"
 			     "msr daifclr, #2\n"
 			     /* handle IRQ */
@@ -154,57 +183,72 @@ static void guest_inject(uint32_t intid, kvm_inject_cmd cmd)
 	}
 	asm volatile("msr daifclr, #2" : : : "memory");
 
-	GUEST_ASSERT_EQ(irq_handled, 1);
-	GUEST_ASSERT_EQ(irqnr_received[intid], 1);
+	GUEST_ASSERT_EQ(irq_handled, num);
+	for (i = first_intid; i < num + first_intid; i++)
+		GUEST_ASSERT_EQ(irqnr_received[i], 1);
 	GUEST_ASSERT_IAR_EMPTY();
+
+	reset_priorities(args);
 }
 
-static void test_injection(struct kvm_inject_desc *f)
+static void test_injection(struct test_args *args, struct kvm_inject_desc *f)
 {
-	if (f->sgi)
-		guest_inject(MIN_SGI, f->cmd);
+	uint32_t nr_irqs = args->nr_irqs;
+
+	if (f->sgi) {
+		guest_inject(args, MIN_SGI, 1, f->cmd);
+		guest_inject(args, 0, 16, f->cmd);
+	}
 
 	if (f->ppi)
-		guest_inject(MIN_PPI, f->cmd);
+		guest_inject(args, MIN_PPI, 1, f->cmd);
 
-	if (f->spi)
-		guest_inject(MIN_SPI, f->cmd);
+	if (f->spi) {
+		guest_inject(args, MIN_SPI, 1, f->cmd);
+		guest_inject(args, nr_irqs - 1, 1, f->cmd);
+		guest_inject(args, MIN_SPI, nr_irqs - MIN_SPI, f->cmd);
+	}
 }
 
-static void guest_code(void)
+static void guest_code(struct test_args args)
 {
-	uint32_t i;
-	uint32_t nr_irqs = 64; /* absolute minimum number of IRQs supported. */
+	uint32_t i, nr_irqs = args.nr_irqs;
 	struct kvm_inject_desc *f;
 
 	gic_init(GIC_V3, 1, dist, redist);
 
-	for (i = 0; i < nr_irqs; i++) {
+	for (i = 0; i < nr_irqs; i++)
 		gic_irq_enable(i);
-		gic_set_priority(i, IRQ_DEFAULT_PRIO_REG);
-	}
 
+	reset_priorities(&args);
 	gic_set_priority_mask(CPU_PRIO_MASK);
 
 	local_irq_enable();
 
 	/* Start the tests. */
 	for_each_inject_fn(inject_edge_fns, f)
-		test_injection(f);
+		test_injection(&args, f);
 
 	GUEST_DONE();
 }
 
 static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
-		struct kvm_inject_args *inject_args)
+		struct kvm_inject_args *inject_args,
+		struct test_args *test_args)
 {
 	kvm_inject_cmd cmd = inject_args->cmd;
-	uint32_t intid = inject_args->intid;
+	uint32_t intid = inject_args->first_intid;
+	uint32_t num = inject_args->num;
+	uint32_t i;
+
+	assert(intid < UINT_MAX - num);
 
 	switch (cmd) {
 	case KVM_INJECT_EDGE_IRQ_LINE:
-		kvm_arm_irq_line(vm, intid, 1);
-		kvm_arm_irq_line(vm, intid, 0);
+		for (i = intid; i < intid + num; i++)
+			kvm_arm_irq_line(vm, i, 1);
+		for (i = intid; i < intid + num; i++)
+			kvm_arm_irq_line(vm, i, 0);
 		break;
 	default:
 		break;
@@ -222,21 +266,35 @@ static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 	memcpy(args, kvm_args_hva, sizeof(struct kvm_inject_args));
 }
 
+static void print_args(struct test_args *args)
+{
+	printf("nr-irqs=%d\n", args->nr_irqs);
+}
 
-static void test_vgic(void)
+static void test_vgic(uint32_t nr_irqs)
 {
 	struct ucall uc;
 	int gic_fd;
 	struct kvm_vm *vm;
 	struct kvm_inject_args inject_args;
 
+	struct test_args args = {
+		.nr_irqs = nr_irqs,
+	};
+
+	print_args(&args);
+
 	vm = vm_create_default(VCPU_ID, 0, guest_code);
 	ucall_init(vm, NULL);
 
 	vm_init_descriptor_tables(vm);
 	vcpu_init_descriptor_tables(vm, VCPU_ID);
 
-	gic_fd = vgic_v3_setup(vm, 1, GICD_BASE_GPA, GICR_BASE_GPA);
+	/* Setup the guest args page (so it gets the args). */
+	vcpu_args_set(vm, 0, 1, args);
+
+	gic_fd = vgic_v3_setup(vm, 1, nr_irqs,
+			GICD_BASE_GPA, GICR_BASE_GPA);
 
 	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT,
 			guest_irq_handler);
@@ -247,7 +305,7 @@ static void test_vgic(void)
 		switch (get_ucall(vm, VCPU_ID, &uc)) {
 		case UCALL_SYNC:
 			kvm_inject_get_call(vm, &uc, &inject_args);
-			run_guest_cmd(vm, gic_fd, &inject_args);
+			run_guest_cmd(vm, gic_fd, &inject_args, &args);
 			break;
 		case UCALL_ABORT:
 			TEST_FAIL("%s at %s:%ld\n\tvalues: %#lx, %#lx",
@@ -266,12 +324,39 @@ done:
 	kvm_vm_free(vm);
 }
 
-int main(int ac, char **av)
+static void help(const char *name)
 {
+	printf(
+	"\n"
+	"usage: %s [-n num_irqs]\n", name);
+	printf(" -n: specify the number of IRQs to configure the vgic with.\n");
+	puts("");
+	exit(1);
+}
+
+int main(int argc, char **argv)
+{
+	uint32_t nr_irqs = 64;
+	int opt;
+
 	/* Tell stdout not to buffer its content */
 	setbuf(stdout, NULL);
 
-	test_vgic();
+	while ((opt = getopt(argc, argv, "hg:n:")) != -1) {
+		switch (opt) {
+		case 'n':
+			nr_irqs = atoi(optarg);
+			if (nr_irqs > 1024 || nr_irqs % 32)
+				help(argv[0]);
+			break;
+		case 'h':
+		default:
+			help(argv[0]);
+			break;
+		}
+	}
+
+	test_vgic(nr_irqs);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h
index ec8744bb2d4b..ce6f0383c1a1 100644
--- a/tools/testing/selftests/kvm/include/aarch64/vgic.h
+++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h
@@ -14,7 +14,7 @@
 	((uint64_t)(flags) << 12) | \
 	index)
 
-int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus,
+int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs,
 		uint64_t gicd_base_gpa, uint64_t gicr_base_gpa);
 
 #define VGIC_MAX_RESERVED	1023
diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
index a1f1f6c8e2e0..84206d7c92b4 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
@@ -31,7 +31,7 @@
  * redistributor regions of the guest. Since it depends on the number of
  * vCPUs for the VM, it must be called after all the vCPUs have been created.
  */
-int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus,
+int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs,
 		uint64_t gicd_base_gpa, uint64_t gicr_base_gpa)
 {
 	int gic_fd;
@@ -53,6 +53,13 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus,
 
 	/* Distributor setup */
 	gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+	kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
+			0, &nr_irqs, true);
+
+	kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+			KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
 	kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
 			KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa, true);
 	nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE);
-- 
cgit v1.2.3


From 8a35b2877d9a15fa885cea744f1e578e035856fe Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:58 -0800
Subject: KVM: selftests: aarch64: Cmdline arg to set EOI mode in vgic_irq

Add a new cmdline arg to set the EOI mode for all vgic_irq tests.  This
specifies whether a write to EOIR will deactivate IRQs or not.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-10-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 58 ++++++++++++++++++++++----
 1 file changed, 50 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index 0b89a29dfe79..3e18fa224280 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -29,6 +29,7 @@
  */
 struct test_args {
 	uint32_t nr_irqs; /* number of KVM supported IRQs. */
+	bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */
 };
 
 /*
@@ -112,7 +113,7 @@ static uint64_t gic_read_ap1r0(void)
 	return reg;
 }
 
-static void guest_irq_handler(struct ex_regs *regs)
+static void guest_irq_generic_handler(bool eoi_split)
 {
 	uint32_t intid = gic_get_and_ack_irq();
 
@@ -129,6 +130,8 @@ static void guest_irq_handler(struct ex_regs *regs)
 
 	gic_set_eoi(intid);
 	GUEST_ASSERT_EQ(gic_read_ap1r0(), 0);
+	if (eoi_split)
+		gic_set_dir(intid);
 
 	GUEST_ASSERT(!gic_irq_get_active(intid));
 	GUEST_ASSERT(!gic_irq_get_pending(intid));
@@ -151,6 +154,24 @@ do { 										\
 	GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS);			\
 } while (0)
 
+#define CAT_HELPER(a, b) a ## b
+#define CAT(a, b) CAT_HELPER(a, b)
+#define PREFIX guest_irq_handler_
+#define GUEST_IRQ_HANDLER_NAME(split) CAT(PREFIX, split)
+#define GENERATE_GUEST_IRQ_HANDLER(split)					\
+static void CAT(PREFIX, split)(struct ex_regs *regs)				\
+{										\
+	guest_irq_generic_handler(split);					\
+}
+
+GENERATE_GUEST_IRQ_HANDLER(0);
+GENERATE_GUEST_IRQ_HANDLER(1);
+
+static void (*guest_irq_handlers[2])(struct ex_regs *) = {
+	GUEST_IRQ_HANDLER_NAME(0),
+	GUEST_IRQ_HANDLER_NAME(1),
+};
+
 static void reset_priorities(struct test_args *args)
 {
 	int i;
@@ -220,6 +241,8 @@ static void guest_code(struct test_args args)
 	for (i = 0; i < nr_irqs; i++)
 		gic_irq_enable(i);
 
+	gic_set_eoi_split(args.eoi_split);
+
 	reset_priorities(&args);
 	gic_set_priority_mask(CPU_PRIO_MASK);
 
@@ -268,10 +291,11 @@ static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 
 static void print_args(struct test_args *args)
 {
-	printf("nr-irqs=%d\n", args->nr_irqs);
+	printf("nr-irqs=%d eoi-split=%d\n",
+			args->nr_irqs, args->eoi_split);
 }
 
-static void test_vgic(uint32_t nr_irqs)
+static void test_vgic(uint32_t nr_irqs, bool eoi_split)
 {
 	struct ucall uc;
 	int gic_fd;
@@ -280,6 +304,7 @@ static void test_vgic(uint32_t nr_irqs)
 
 	struct test_args args = {
 		.nr_irqs = nr_irqs,
+		.eoi_split = eoi_split,
 	};
 
 	print_args(&args);
@@ -297,7 +322,7 @@ static void test_vgic(uint32_t nr_irqs)
 			GICD_BASE_GPA, GICR_BASE_GPA);
 
 	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT,
-			guest_irq_handler);
+			guest_irq_handlers[args.eoi_split]);
 
 	while (1) {
 		vcpu_run(vm, VCPU_ID);
@@ -328,8 +353,11 @@ static void help(const char *name)
 {
 	printf(
 	"\n"
-	"usage: %s [-n num_irqs]\n", name);
-	printf(" -n: specify the number of IRQs to configure the vgic with.\n");
+	"usage: %s [-n num_irqs] [-e eoi_split]\n", name);
+	printf(" -n: specify the number of IRQs to configure the vgic with. "
+		"It has to be a multiple of 32 and between 64 and 1024.\n");
+	printf(" -e: if 1 then EOI is split into a write to DIR on top "
+		"of writing EOI.\n");
 	puts("");
 	exit(1);
 }
@@ -337,18 +365,24 @@ static void help(const char *name)
 int main(int argc, char **argv)
 {
 	uint32_t nr_irqs = 64;
+	bool default_args = true;
 	int opt;
+	bool eoi_split = false;
 
 	/* Tell stdout not to buffer its content */
 	setbuf(stdout, NULL);
 
-	while ((opt = getopt(argc, argv, "hg:n:")) != -1) {
+	while ((opt = getopt(argc, argv, "hn:e:")) != -1) {
 		switch (opt) {
 		case 'n':
 			nr_irqs = atoi(optarg);
 			if (nr_irqs > 1024 || nr_irqs % 32)
 				help(argv[0]);
 			break;
+		case 'e':
+			eoi_split = (bool)atoi(optarg);
+			default_args = false;
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -356,7 +390,15 @@ int main(int argc, char **argv)
 		}
 	}
 
-	test_vgic(nr_irqs);
+	/* If the user just specified nr_irqs and/or gic_version, then run all
+	 * combinations.
+	 */
+	if (default_args) {
+		test_vgic(nr_irqs, false /* eoi_split */);
+		test_vgic(nr_irqs, true /* eoi_split */);
+	} else {
+		test_vgic(nr_irqs, eoi_split);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0ad3ff4a6adc4922808ef8b2f91880c25195f509 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:38:59 -0800
Subject: KVM: selftests: aarch64: Add preemption tests in vgic_irq

Add tests for IRQ preemption (having more than one activated IRQ at the
same time).  This test injects multiple concurrent IRQs and handles them
without handling the actual exceptions.  This is done by masking
interrupts for the whole test.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-11-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 91 +++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index 3e18fa224280..b9080aa75a14 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -41,6 +41,7 @@ struct test_args {
  */
 #define KVM_NUM_PRIOS		32
 #define KVM_PRIO_SHIFT		3 /* steps of 8 = 1 << 3 */
+#define KVM_PRIO_STEPS		(1 << KVM_PRIO_SHIFT) /* 8 */
 #define LOWEST_PRIO		(KVM_NUM_PRIOS - 1)
 #define CPU_PRIO_MASK		(LOWEST_PRIO << KVM_PRIO_SHIFT)	/* 0xf8 */
 #define IRQ_DEFAULT_PRIO	(LOWEST_PRIO - 1)
@@ -212,6 +213,74 @@ static void guest_inject(struct test_args *args,
 	reset_priorities(args);
 }
 
+/*
+ * Polls the IAR until it's not a spurious interrupt.
+ *
+ * This function should only be used in test_inject_preemption (with IRQs
+ * masked).
+ */
+static uint32_t wait_for_and_activate_irq(void)
+{
+	uint32_t intid;
+
+	do {
+		asm volatile("wfi" : : : "memory");
+		intid = gic_get_and_ack_irq();
+	} while (intid == IAR_SPURIOUS);
+
+	return intid;
+}
+
+/*
+ * Inject multiple concurrent IRQs (num IRQs starting at first_intid) and
+ * handle them without handling the actual exceptions.  This is done by masking
+ * interrupts for the whole test.
+ */
+static void test_inject_preemption(struct test_args *args,
+		uint32_t first_intid, int num,
+		kvm_inject_cmd cmd)
+{
+	uint32_t intid, prio, step = KVM_PRIO_STEPS;
+	int i;
+
+	/* Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs
+	 * in descending order, so intid+1 can preempt intid.
+	 */
+	for (i = 0, prio = (num - 1) * step; i < num; i++, prio -= step) {
+		GUEST_ASSERT(prio >= 0);
+		intid = i + first_intid;
+		gic_set_priority(intid, prio);
+	}
+
+	local_irq_disable();
+
+	for (i = 0; i < num; i++) {
+		uint32_t tmp;
+		intid = i + first_intid;
+		kvm_inject_call(cmd, intid, 1);
+		/* Each successive IRQ will preempt the previous one. */
+		tmp = wait_for_and_activate_irq();
+		GUEST_ASSERT_EQ(tmp, intid);
+	}
+
+	/* finish handling the IRQs starting with the highest priority one. */
+	for (i = 0; i < num; i++) {
+		intid = num - i - 1 + first_intid;
+		gic_set_eoi(intid);
+		if (args->eoi_split)
+			gic_set_dir(intid);
+	}
+
+	local_irq_enable();
+
+	for (i = 0; i < num; i++)
+		GUEST_ASSERT(!gic_irq_get_active(i + first_intid));
+	GUEST_ASSERT_EQ(gic_read_ap1r0(), 0);
+	GUEST_ASSERT_IAR_EMPTY();
+
+	reset_priorities(args);
+}
+
 static void test_injection(struct test_args *args, struct kvm_inject_desc *f)
 {
 	uint32_t nr_irqs = args->nr_irqs;
@@ -231,6 +300,24 @@ static void test_injection(struct test_args *args, struct kvm_inject_desc *f)
 	}
 }
 
+static void test_preemption(struct test_args *args, struct kvm_inject_desc *f)
+{
+	/*
+	 * Test up to 4 levels of preemption. The reason is that KVM doesn't
+	 * currently implement the ability to have more than the number-of-LRs
+	 * number of concurrently active IRQs. The number of LRs implemented is
+	 * IMPLEMENTATION DEFINED, however, it seems that most implement 4.
+	 */
+	if (f->sgi)
+		test_inject_preemption(args, MIN_SGI, 4, f->cmd);
+
+	if (f->ppi)
+		test_inject_preemption(args, MIN_PPI, 4, f->cmd);
+
+	if (f->spi)
+		test_inject_preemption(args, MIN_SPI, 4, f->cmd);
+}
+
 static void guest_code(struct test_args args)
 {
 	uint32_t i, nr_irqs = args.nr_irqs;
@@ -249,8 +336,10 @@ static void guest_code(struct test_args args)
 	local_irq_enable();
 
 	/* Start the tests. */
-	for_each_inject_fn(inject_edge_fns, f)
+	for_each_inject_fn(inject_edge_fns, f) {
 		test_injection(&args, f);
+		test_preemption(&args, f);
+	}
 
 	GUEST_DONE();
 }
-- 
cgit v1.2.3


From 92f2cc4aa7964d4d13681eeb38582bb989b01b98 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:00 -0800
Subject: KVM: selftests: aarch64: Level-sensitive interrupts tests in vgic_irq

Add a cmdline arg for using level-sensitive interrupts (vs the default
edge-triggered). Then move the handler into a generic handler function
that takes the type of interrupt (level vs. edge) as an arg.  When
handling line-sensitive interrupts it sets the line to low after
acknowledging the IRQ.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-12-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 118 ++++++++++++++++++-------
 1 file changed, 86 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index b9080aa75a14..a20d225367a1 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -30,6 +30,7 @@
 struct test_args {
 	uint32_t nr_irqs; /* number of KVM supported IRQs. */
 	bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */
+	bool level_sensitive; /* 1 is level, 0 is edge */
 };
 
 /*
@@ -57,27 +58,31 @@ static void *redist = (void *)GICR_BASE_GPA;
 
 typedef enum {
 	KVM_INJECT_EDGE_IRQ_LINE = 1,
+	KVM_SET_IRQ_LINE,
+	KVM_SET_IRQ_LINE_HIGH,
 } kvm_inject_cmd;
 
 struct kvm_inject_args {
 	kvm_inject_cmd cmd;
 	uint32_t first_intid;
 	uint32_t num;
+	int level;
 };
 
 /* Used on the guest side to perform the hypercall. */
-static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, uint32_t num);
-
-#define KVM_INJECT(cmd, intid)							\
-	kvm_inject_call(cmd, intid, 1)
-
-#define KVM_INJECT_MULTI(cmd, intid, num)					\
-	kvm_inject_call(cmd, intid, num)
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
+			uint32_t num, int level);
 
 /* Used on the host side to get the hypercall info. */
 static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 		struct kvm_inject_args *args);
 
+#define KVM_INJECT(cmd, intid)							\
+	kvm_inject_call(cmd, intid, 1, -1 /* not used */)
+
+#define KVM_INJECT_MULTI(cmd, intid, num)					\
+	kvm_inject_call(cmd, intid, num, -1 /* not used */)
+
 struct kvm_inject_desc {
 	kvm_inject_cmd cmd;
 	/* can inject PPIs, PPIs, and/or SPIs. */
@@ -90,6 +95,12 @@ static struct kvm_inject_desc inject_edge_fns[] = {
 	{ 0, },
 };
 
+static struct kvm_inject_desc inject_level_fns[] = {
+	/*                                      sgi    ppi    spi */
+	{ KVM_SET_IRQ_LINE_HIGH,		false, true,  true },
+	{ 0, },
+};
+
 #define for_each_inject_fn(t, f)						\
 	for ((f) = (t); (f)->cmd; (f)++)
 
@@ -114,7 +125,9 @@ static uint64_t gic_read_ap1r0(void)
 	return reg;
 }
 
-static void guest_irq_generic_handler(bool eoi_split)
+static void guest_set_irq_line(uint32_t intid, uint32_t level);
+
+static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive)
 {
 	uint32_t intid = gic_get_and_ack_irq();
 
@@ -123,7 +136,11 @@ static void guest_irq_generic_handler(bool eoi_split)
 
 	GUEST_ASSERT(gic_irq_get_active(intid));
 
-	GUEST_ASSERT(!gic_irq_get_pending(intid));
+	if (!level_sensitive)
+		GUEST_ASSERT(!gic_irq_get_pending(intid));
+
+	if (level_sensitive)
+		guest_set_irq_line(intid, 0);
 
 	GUEST_ASSERT(intid < MAX_SPI);
 	irqnr_received[intid] += 1;
@@ -138,12 +155,14 @@ static void guest_irq_generic_handler(bool eoi_split)
 	GUEST_ASSERT(!gic_irq_get_pending(intid));
 }
 
-static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, uint32_t num)
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
+			uint32_t num, int level)
 {
 	struct kvm_inject_args args = {
 		.cmd = cmd,
 		.first_intid = first_intid,
 		.num = num,
+		.level = level,
 	};
 	GUEST_SYNC(&args);
 }
@@ -158,19 +177,21 @@ do { 										\
 #define CAT_HELPER(a, b) a ## b
 #define CAT(a, b) CAT_HELPER(a, b)
 #define PREFIX guest_irq_handler_
-#define GUEST_IRQ_HANDLER_NAME(split) CAT(PREFIX, split)
-#define GENERATE_GUEST_IRQ_HANDLER(split)					\
-static void CAT(PREFIX, split)(struct ex_regs *regs)				\
+#define GUEST_IRQ_HANDLER_NAME(split, lev) CAT(PREFIX, CAT(split, lev))
+#define GENERATE_GUEST_IRQ_HANDLER(split, lev)					\
+static void CAT(PREFIX, CAT(split, lev))(struct ex_regs *regs)			\
 {										\
-	guest_irq_generic_handler(split);					\
+	guest_irq_generic_handler(split, lev);					\
 }
 
-GENERATE_GUEST_IRQ_HANDLER(0);
-GENERATE_GUEST_IRQ_HANDLER(1);
+GENERATE_GUEST_IRQ_HANDLER(0, 0);
+GENERATE_GUEST_IRQ_HANDLER(0, 1);
+GENERATE_GUEST_IRQ_HANDLER(1, 0);
+GENERATE_GUEST_IRQ_HANDLER(1, 1);
 
-static void (*guest_irq_handlers[2])(struct ex_regs *) = {
-	GUEST_IRQ_HANDLER_NAME(0),
-	GUEST_IRQ_HANDLER_NAME(1),
+static void (*guest_irq_handlers[2][2])(struct ex_regs *) = {
+	{GUEST_IRQ_HANDLER_NAME(0, 0), GUEST_IRQ_HANDLER_NAME(0, 1),},
+	{GUEST_IRQ_HANDLER_NAME(1, 0), GUEST_IRQ_HANDLER_NAME(1, 1),},
 };
 
 static void reset_priorities(struct test_args *args)
@@ -181,6 +202,11 @@ static void reset_priorities(struct test_args *args)
 		gic_set_priority(i, IRQ_DEFAULT_PRIO_REG);
 }
 
+static void guest_set_irq_line(uint32_t intid, uint32_t level)
+{
+	kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level);
+}
+
 static void guest_inject(struct test_args *args,
 		uint32_t first_intid, uint32_t num,
 		kvm_inject_cmd cmd)
@@ -257,10 +283,12 @@ static void test_inject_preemption(struct test_args *args,
 	for (i = 0; i < num; i++) {
 		uint32_t tmp;
 		intid = i + first_intid;
-		kvm_inject_call(cmd, intid, 1);
+		KVM_INJECT(cmd, intid);
 		/* Each successive IRQ will preempt the previous one. */
 		tmp = wait_for_and_activate_irq();
 		GUEST_ASSERT_EQ(tmp, intid);
+		if (args->level_sensitive)
+			guest_set_irq_line(intid, 0);
 	}
 
 	/* finish handling the IRQs starting with the highest priority one. */
@@ -321,22 +349,29 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f)
 static void guest_code(struct test_args args)
 {
 	uint32_t i, nr_irqs = args.nr_irqs;
-	struct kvm_inject_desc *f;
+	bool level_sensitive = args.level_sensitive;
+	struct kvm_inject_desc *f, *inject_fns;
 
 	gic_init(GIC_V3, 1, dist, redist);
 
 	for (i = 0; i < nr_irqs; i++)
 		gic_irq_enable(i);
 
+	for (i = MIN_SPI; i < nr_irqs; i++)
+		gic_irq_set_config(i, !args.level_sensitive);
+
 	gic_set_eoi_split(args.eoi_split);
 
 	reset_priorities(&args);
 	gic_set_priority_mask(CPU_PRIO_MASK);
 
+	inject_fns  = level_sensitive ? inject_level_fns
+				      : inject_edge_fns;
+
 	local_irq_enable();
 
 	/* Start the tests. */
-	for_each_inject_fn(inject_edge_fns, f) {
+	for_each_inject_fn(inject_fns, f) {
 		test_injection(&args, f);
 		test_preemption(&args, f);
 	}
@@ -351,6 +386,7 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 	kvm_inject_cmd cmd = inject_args->cmd;
 	uint32_t intid = inject_args->first_intid;
 	uint32_t num = inject_args->num;
+	int level = inject_args->level;
 	uint32_t i;
 
 	assert(intid < UINT_MAX - num);
@@ -362,6 +398,14 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 		for (i = intid; i < intid + num; i++)
 			kvm_arm_irq_line(vm, i, 0);
 		break;
+	case KVM_SET_IRQ_LINE:
+		for (i = intid; i < intid + num; i++)
+			kvm_arm_irq_line(vm, i, level);
+		break;
+	case KVM_SET_IRQ_LINE_HIGH:
+		for (i = intid; i < intid + num; i++)
+			kvm_arm_irq_line(vm, i, 1);
+		break;
 	default:
 		break;
 	}
@@ -380,11 +424,12 @@ static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 
 static void print_args(struct test_args *args)
 {
-	printf("nr-irqs=%d eoi-split=%d\n",
-			args->nr_irqs, args->eoi_split);
+	printf("nr-irqs=%d level-sensitive=%d eoi-split=%d\n",
+			args->nr_irqs, args->level_sensitive,
+			args->eoi_split);
 }
 
-static void test_vgic(uint32_t nr_irqs, bool eoi_split)
+static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split)
 {
 	struct ucall uc;
 	int gic_fd;
@@ -393,6 +438,7 @@ static void test_vgic(uint32_t nr_irqs, bool eoi_split)
 
 	struct test_args args = {
 		.nr_irqs = nr_irqs,
+		.level_sensitive = level_sensitive,
 		.eoi_split = eoi_split,
 	};
 
@@ -411,7 +457,7 @@ static void test_vgic(uint32_t nr_irqs, bool eoi_split)
 			GICD_BASE_GPA, GICR_BASE_GPA);
 
 	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT,
-			guest_irq_handlers[args.eoi_split]);
+		guest_irq_handlers[args.eoi_split][args.level_sensitive]);
 
 	while (1) {
 		vcpu_run(vm, VCPU_ID);
@@ -442,11 +488,12 @@ static void help(const char *name)
 {
 	printf(
 	"\n"
-	"usage: %s [-n num_irqs] [-e eoi_split]\n", name);
-	printf(" -n: specify the number of IRQs to configure the vgic with. "
+	"usage: %s [-n num_irqs] [-e eoi_split] [-l level_sensitive]\n", name);
+	printf(" -n: specify number of IRQs to setup the vgic with. "
 		"It has to be a multiple of 32 and between 64 and 1024.\n");
 	printf(" -e: if 1 then EOI is split into a write to DIR on top "
 		"of writing EOI.\n");
+	printf(" -l: specify whether the IRQs are level-sensitive (1) or not (0).");
 	puts("");
 	exit(1);
 }
@@ -455,13 +502,14 @@ int main(int argc, char **argv)
 {
 	uint32_t nr_irqs = 64;
 	bool default_args = true;
+	bool level_sensitive = false;
 	int opt;
 	bool eoi_split = false;
 
 	/* Tell stdout not to buffer its content */
 	setbuf(stdout, NULL);
 
-	while ((opt = getopt(argc, argv, "hn:e:")) != -1) {
+	while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) {
 		switch (opt) {
 		case 'n':
 			nr_irqs = atoi(optarg);
@@ -472,6 +520,10 @@ int main(int argc, char **argv)
 			eoi_split = (bool)atoi(optarg);
 			default_args = false;
 			break;
+		case 'l':
+			level_sensitive = (bool)atoi(optarg);
+			default_args = false;
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -483,10 +535,12 @@ int main(int argc, char **argv)
 	 * combinations.
 	 */
 	if (default_args) {
-		test_vgic(nr_irqs, false /* eoi_split */);
-		test_vgic(nr_irqs, true /* eoi_split */);
+		test_vgic(nr_irqs, false /* level */, false /* eoi_split */);
+		test_vgic(nr_irqs, false /* level */, true /* eoi_split */);
+		test_vgic(nr_irqs, true /* level */, false /* eoi_split */);
+		test_vgic(nr_irqs, true /* level */, true /* eoi_split */);
 	} else {
-		test_vgic(nr_irqs, eoi_split);
+		test_vgic(nr_irqs, level_sensitive, eoi_split);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 6830fa915912587a7aa304bade01b366cf0b9214 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:01 -0800
Subject: KVM: selftests: aarch64: Add tests for LEVEL_INFO in vgic_irq

Add injection tests for the LEVEL_INFO ioctl (level-sensitive specific)
into vgic_irq.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-13-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index a20d225367a1..bc1b6fd684fc 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -60,6 +60,7 @@ typedef enum {
 	KVM_INJECT_EDGE_IRQ_LINE = 1,
 	KVM_SET_IRQ_LINE,
 	KVM_SET_IRQ_LINE_HIGH,
+	KVM_SET_LEVEL_INFO_HIGH,
 } kvm_inject_cmd;
 
 struct kvm_inject_args {
@@ -98,6 +99,7 @@ static struct kvm_inject_desc inject_edge_fns[] = {
 static struct kvm_inject_desc inject_level_fns[] = {
 	/*                                      sgi    ppi    spi */
 	{ KVM_SET_IRQ_LINE_HIGH,		false, true,  true },
+	{ KVM_SET_LEVEL_INFO_HIGH,		false, true,  true },
 	{ 0, },
 };
 
@@ -406,6 +408,10 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 		for (i = intid; i < intid + num; i++)
 			kvm_arm_irq_line(vm, i, 1);
 		break;
+	case KVM_SET_LEVEL_INFO_HIGH:
+		for (i = intid; i < intid + num; i++)
+			kvm_irq_set_level_info(gic_fd, i, 1);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 90f50acac9ee9f81192098c22b2cbf2491a40263 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:02 -0800
Subject: KVM: selftests: aarch64: Add test_inject_fail to vgic_irq

Add tests for failed injections to vgic_irq. This tests that KVM can
handle bogus IRQ numbers.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-14-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 122 +++++++++++++++++++++----
 tools/testing/selftests/kvm/lib/aarch64/vgic.c |   7 +-
 2 files changed, 109 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index bc1b6fd684fc..9f1674b3a45c 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -68,21 +68,28 @@ struct kvm_inject_args {
 	uint32_t first_intid;
 	uint32_t num;
 	int level;
+	bool expect_failure;
 };
 
 /* Used on the guest side to perform the hypercall. */
 static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
-			uint32_t num, int level);
+		uint32_t num, int level, bool expect_failure);
 
 /* Used on the host side to get the hypercall info. */
 static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 		struct kvm_inject_args *args);
 
-#define KVM_INJECT(cmd, intid)							\
-	kvm_inject_call(cmd, intid, 1, -1 /* not used */)
+#define _KVM_INJECT_MULTI(cmd, intid, num, expect_failure)			\
+	kvm_inject_call(cmd, intid, num, -1 /* not used */, expect_failure)
 
 #define KVM_INJECT_MULTI(cmd, intid, num)					\
-	kvm_inject_call(cmd, intid, num, -1 /* not used */)
+	_KVM_INJECT_MULTI(cmd, intid, num, false)
+
+#define _KVM_INJECT(cmd, intid, expect_failure)					\
+	_KVM_INJECT_MULTI(cmd, intid, 1, expect_failure)
+
+#define KVM_INJECT(cmd, intid)							\
+	_KVM_INJECT_MULTI(cmd, intid, 1, false)
 
 struct kvm_inject_desc {
 	kvm_inject_cmd cmd;
@@ -158,13 +165,14 @@ static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive)
 }
 
 static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
-			uint32_t num, int level)
+		uint32_t num, int level, bool expect_failure)
 {
 	struct kvm_inject_args args = {
 		.cmd = cmd,
 		.first_intid = first_intid,
 		.num = num,
 		.level = level,
+		.expect_failure = expect_failure,
 	};
 	GUEST_SYNC(&args);
 }
@@ -206,7 +214,19 @@ static void reset_priorities(struct test_args *args)
 
 static void guest_set_irq_line(uint32_t intid, uint32_t level)
 {
-	kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level);
+	kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level, false);
+}
+
+static void test_inject_fail(struct test_args *args,
+		uint32_t intid, kvm_inject_cmd cmd)
+{
+	reset_stats();
+
+	_KVM_INJECT(cmd, intid, true);
+	/* no IRQ to handle on entry */
+
+	GUEST_ASSERT_EQ(irq_handled, 0);
+	GUEST_ASSERT_IAR_EMPTY();
 }
 
 static void guest_inject(struct test_args *args,
@@ -330,6 +350,16 @@ static void test_injection(struct test_args *args, struct kvm_inject_desc *f)
 	}
 }
 
+static void test_injection_failure(struct test_args *args,
+		struct kvm_inject_desc *f)
+{
+	uint32_t bad_intid[] = { args->nr_irqs, 1020, 1024, 1120, 5120, ~0U, };
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(bad_intid); i++)
+		test_inject_fail(args, bad_intid[i], f->cmd);
+}
+
 static void test_preemption(struct test_args *args, struct kvm_inject_desc *f)
 {
 	/*
@@ -376,11 +406,61 @@ static void guest_code(struct test_args args)
 	for_each_inject_fn(inject_fns, f) {
 		test_injection(&args, f);
 		test_preemption(&args, f);
+		test_injection_failure(&args, f);
 	}
 
 	GUEST_DONE();
 }
 
+static void kvm_irq_line_check(struct kvm_vm *vm, uint32_t intid, int level,
+			struct test_args *test_args, bool expect_failure)
+{
+	int ret;
+
+	if (!expect_failure) {
+		kvm_arm_irq_line(vm, intid, level);
+	} else {
+		/* The interface doesn't allow larger intid's. */
+		if (intid > KVM_ARM_IRQ_NUM_MASK)
+			return;
+
+		ret = _kvm_arm_irq_line(vm, intid, level);
+		TEST_ASSERT(ret != 0 && errno == EINVAL,
+				"Bad intid %i did not cause KVM_IRQ_LINE "
+				"error: rc: %i errno: %i", intid, ret, errno);
+	}
+}
+
+void kvm_irq_set_level_info_check(int gic_fd, uint32_t intid, int level,
+			bool expect_failure)
+{
+	if (!expect_failure) {
+		kvm_irq_set_level_info(gic_fd, intid, level);
+	} else {
+		int ret = _kvm_irq_set_level_info(gic_fd, intid, level);
+		/*
+		 * The kernel silently fails for invalid SPIs and SGIs (which
+		 * are not level-sensitive). It only checks for intid to not
+		 * spill over 1U << 10 (the max reserved SPI). Also, callers
+		 * are supposed to mask the intid with 0x3ff (1023).
+		 */
+		if (intid > VGIC_MAX_RESERVED)
+			TEST_ASSERT(ret != 0 && errno == EINVAL,
+				"Bad intid %i did not cause VGIC_GRP_LEVEL_INFO "
+				"error: rc: %i errno: %i", intid, ret, errno);
+		else
+			TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO "
+				"for intid %i failed, rc: %i errno: %i",
+				intid, ret, errno);
+	}
+}
+
+/* handles the valid case: intid=0xffffffff num=1 */
+#define for_each_intid(first, num, tmp, i)					\
+	for ((tmp) = (i) = (first);						\
+		(tmp) < (uint64_t)(first) + (uint64_t)(num);			\
+		(tmp)++, (i)++)
+
 static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 		struct kvm_inject_args *inject_args,
 		struct test_args *test_args)
@@ -389,28 +469,36 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 	uint32_t intid = inject_args->first_intid;
 	uint32_t num = inject_args->num;
 	int level = inject_args->level;
+	bool expect_failure = inject_args->expect_failure;
+	uint64_t tmp;
 	uint32_t i;
 
-	assert(intid < UINT_MAX - num);
+	/* handles the valid case: intid=0xffffffff num=1 */
+	assert(intid < UINT_MAX - num || num == 1);
 
 	switch (cmd) {
 	case KVM_INJECT_EDGE_IRQ_LINE:
-		for (i = intid; i < intid + num; i++)
-			kvm_arm_irq_line(vm, i, 1);
-		for (i = intid; i < intid + num; i++)
-			kvm_arm_irq_line(vm, i, 0);
+		for_each_intid(intid, num, tmp, i)
+			kvm_irq_line_check(vm, i, 1, test_args,
+					expect_failure);
+		for_each_intid(intid, num, tmp, i)
+			kvm_irq_line_check(vm, i, 0, test_args,
+					expect_failure);
 		break;
 	case KVM_SET_IRQ_LINE:
-		for (i = intid; i < intid + num; i++)
-			kvm_arm_irq_line(vm, i, level);
+		for_each_intid(intid, num, tmp, i)
+			kvm_irq_line_check(vm, i, level, test_args,
+					expect_failure);
 		break;
 	case KVM_SET_IRQ_LINE_HIGH:
-		for (i = intid; i < intid + num; i++)
-			kvm_arm_irq_line(vm, i, 1);
+		for_each_intid(intid, num, tmp, i)
+			kvm_irq_line_check(vm, i, 1, test_args,
+					expect_failure);
 		break;
 	case KVM_SET_LEVEL_INFO_HIGH:
-		for (i = intid; i < intid + num; i++)
-			kvm_irq_set_level_info(gic_fd, i, 1);
+		for_each_intid(intid, num, tmp, i)
+			kvm_irq_set_level_info_check(gic_fd, i, 1,
+					expect_failure);
 		break;
 	default:
 		break;
diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
index 84206d7c92b4..b3a0fca0d780 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c
@@ -110,12 +110,13 @@ int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level)
 {
 	uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK;
 
+	TEST_ASSERT(!INTID_IS_SGI(intid), "KVM_IRQ_LINE's interface itself "
+		"doesn't allow injecting SGIs. There's no mask for it.");
+
 	if (INTID_IS_PPI(intid))
 		irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT;
-	else if (INTID_IS_SPI(intid))
-		irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT;
 	else
-		TEST_FAIL("KVM_IRQ_LINE can't be used with SGIs.");
+		irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT;
 
 	return _kvm_irq_line(vm, irq, level);
 }
-- 
cgit v1.2.3


From 88209c104e9b3e95502c0e924fb1cd8bd5a01d82 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:03 -0800
Subject: KVM: selftests: Add IRQ GSI routing library functions

Add an architecture independent wrapper function for creating and
writing IRQ GSI routing tables. Also add a function to add irqchip
entries.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-15-ricarkol@google.com
---
 tools/testing/selftests/kvm/include/kvm_util.h |  8 ++++
 tools/testing/selftests/kvm/lib/kvm_util.c     | 51 ++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 4ba8fd44cf95..db36e6c49983 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -252,6 +252,14 @@ int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
 int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
 			 uint64_t attr, void *val, bool write);
 
+#define KVM_MAX_IRQ_ROUTES		4096
+
+struct kvm_irq_routing *kvm_gsi_routing_create(void);
+void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
+		uint32_t gsi, uint32_t pin);
+int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
+void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 9f7210c2d75e..68b56e7e6808 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2107,6 +2107,57 @@ void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
 	TEST_ASSERT(ret >= 0, "KVM_IRQ_LINE failed, rc: %i errno: %i", ret, errno);
 }
 
+struct kvm_irq_routing *kvm_gsi_routing_create(void)
+{
+	struct kvm_irq_routing *routing;
+	size_t size;
+
+	size = sizeof(struct kvm_irq_routing);
+	/* Allocate space for the max number of entries: this wastes 196 KBs. */
+	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
+	routing = calloc(1, size);
+	assert(routing);
+
+	return routing;
+}
+
+void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
+		uint32_t gsi, uint32_t pin)
+{
+	int i;
+
+	assert(routing);
+	assert(routing->nr < KVM_MAX_IRQ_ROUTES);
+
+	i = routing->nr;
+	routing->entries[i].gsi = gsi;
+	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+	routing->entries[i].flags = 0;
+	routing->entries[i].u.irqchip.irqchip = 0;
+	routing->entries[i].u.irqchip.pin = pin;
+	routing->nr++;
+}
+
+int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
+{
+	int ret;
+
+	assert(routing);
+	ret = ioctl(vm_get_fd(vm), KVM_SET_GSI_ROUTING, routing);
+	free(routing);
+
+	return ret;
+}
+
+void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
+{
+	int ret;
+
+	ret = _kvm_gsi_routing_write(vm, routing);
+	TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING failed, rc: %i errno: %i",
+				ret, errno);
+}
+
 /*
  * VM Dump
  *
-- 
cgit v1.2.3


From 6a5a47188caca7be4bbe28cdb31d5df09868ed5c Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:04 -0800
Subject: KVM: selftests: aarch64: Add tests for IRQFD in vgic_irq

Add injection tests for the KVM_IRQFD ioctl into vgic_irq.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-16-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c     | 101 ++++++++++++++++++++-
 tools/testing/selftests/kvm/include/aarch64/vgic.h |   2 +
 2 files changed, 102 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index 9f1674b3a45c..121113f24ed3 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -10,6 +10,7 @@
 
 #include <asm/kvm.h>
 #include <asm/kvm_para.h>
+#include <sys/eventfd.h>
 #include <linux/sizes.h>
 
 #include "processor.h"
@@ -31,6 +32,8 @@ struct test_args {
 	uint32_t nr_irqs; /* number of KVM supported IRQs. */
 	bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */
 	bool level_sensitive; /* 1 is level, 0 is edge */
+	int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */
+	bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */
 };
 
 /*
@@ -61,6 +64,7 @@ typedef enum {
 	KVM_SET_IRQ_LINE,
 	KVM_SET_IRQ_LINE_HIGH,
 	KVM_SET_LEVEL_INFO_HIGH,
+	KVM_INJECT_IRQFD,
 } kvm_inject_cmd;
 
 struct kvm_inject_args {
@@ -100,6 +104,7 @@ struct kvm_inject_desc {
 static struct kvm_inject_desc inject_edge_fns[] = {
 	/*                                      sgi    ppi    spi */
 	{ KVM_INJECT_EDGE_IRQ_LINE,		false, false, true },
+	{ KVM_INJECT_IRQFD,			false, false, true },
 	{ 0, },
 };
 
@@ -107,12 +112,17 @@ static struct kvm_inject_desc inject_level_fns[] = {
 	/*                                      sgi    ppi    spi */
 	{ KVM_SET_IRQ_LINE_HIGH,		false, true,  true },
 	{ KVM_SET_LEVEL_INFO_HIGH,		false, true,  true },
+	{ KVM_INJECT_IRQFD,			false, false, true },
 	{ 0, },
 };
 
 #define for_each_inject_fn(t, f)						\
 	for ((f) = (t); (f)->cmd; (f)++)
 
+#define for_each_supported_inject_fn(args, t, f)				\
+	for_each_inject_fn(t, f)						\
+		if ((args)->kvm_supports_irqfd || (f)->cmd != KVM_INJECT_IRQFD)
+
 /* Shared between the guest main thread and the IRQ handlers. */
 volatile uint64_t irq_handled;
 volatile uint32_t irqnr_received[MAX_SPI + 1];
@@ -403,7 +413,7 @@ static void guest_code(struct test_args args)
 	local_irq_enable();
 
 	/* Start the tests. */
-	for_each_inject_fn(inject_fns, f) {
+	for_each_supported_inject_fn(&args, inject_fns, f) {
 		test_injection(&args, f);
 		test_preemption(&args, f);
 		test_injection_failure(&args, f);
@@ -455,6 +465,88 @@ void kvm_irq_set_level_info_check(int gic_fd, uint32_t intid, int level,
 	}
 }
 
+static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm,
+		uint32_t intid, uint32_t num, uint32_t kvm_max_routes,
+		bool expect_failure)
+{
+	struct kvm_irq_routing *routing;
+	int ret;
+	uint64_t i;
+
+	assert(num <= kvm_max_routes && kvm_max_routes <= KVM_MAX_IRQ_ROUTES);
+
+	routing = kvm_gsi_routing_create();
+	for (i = intid; i < (uint64_t)intid + num; i++)
+		kvm_gsi_routing_irqchip_add(routing, i - MIN_SPI, i - MIN_SPI);
+
+	if (!expect_failure) {
+		kvm_gsi_routing_write(vm, routing);
+	} else {
+		ret = _kvm_gsi_routing_write(vm, routing);
+		/* The kernel only checks for KVM_IRQCHIP_NUM_PINS. */
+		if (intid >= KVM_IRQCHIP_NUM_PINS)
+			TEST_ASSERT(ret != 0 && errno == EINVAL,
+				"Bad intid %u did not cause KVM_SET_GSI_ROUTING "
+				"error: rc: %i errno: %i", intid, ret, errno);
+		else
+			TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING "
+				"for intid %i failed, rc: %i errno: %i",
+				intid, ret, errno);
+	}
+}
+
+static void kvm_routing_and_irqfd_check(struct kvm_vm *vm,
+		uint32_t intid, uint32_t num, uint32_t kvm_max_routes,
+		bool expect_failure)
+{
+	int fd[MAX_SPI];
+	uint64_t val;
+	int ret, f;
+	uint64_t i;
+
+	/*
+	 * There is no way to try injecting an SGI or PPI as the interface
+	 * starts counting from the first SPI (above the private ones), so just
+	 * exit.
+	 */
+	if (INTID_IS_SGI(intid) || INTID_IS_PPI(intid))
+		return;
+
+	kvm_set_gsi_routing_irqchip_check(vm, intid, num,
+			kvm_max_routes, expect_failure);
+
+	/*
+	 * If expect_failure, then just to inject anyway. These
+	 * will silently fail. And in any case, the guest will check
+	 * that no actual interrupt was injected for those cases.
+	 */
+
+	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
+		fd[f] = eventfd(0, 0);
+		TEST_ASSERT(fd[f] != -1,
+			"eventfd failed, errno: %i\n", errno);
+	}
+
+	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
+		struct kvm_irqfd irqfd = {
+			.fd  = fd[f],
+			.gsi = i - MIN_SPI,
+		};
+		assert(i <= (uint64_t)UINT_MAX);
+		vm_ioctl(vm, KVM_IRQFD, &irqfd);
+	}
+
+	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
+		val = 1;
+		ret = write(fd[f], &val, sizeof(uint64_t));
+		TEST_ASSERT(ret == sizeof(uint64_t),
+			"Write to KVM_IRQFD failed with ret: %d\n", ret);
+	}
+
+	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++)
+		close(fd[f]);
+}
+
 /* handles the valid case: intid=0xffffffff num=1 */
 #define for_each_intid(first, num, tmp, i)					\
 	for ((tmp) = (i) = (first);						\
@@ -500,6 +592,11 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 			kvm_irq_set_level_info_check(gic_fd, i, 1,
 					expect_failure);
 		break;
+	case KVM_INJECT_IRQFD:
+		kvm_routing_and_irqfd_check(vm, intid, num,
+					test_args->kvm_max_routes,
+					expect_failure);
+		break;
 	default:
 		break;
 	}
@@ -534,6 +631,8 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split)
 		.nr_irqs = nr_irqs,
 		.level_sensitive = level_sensitive,
 		.eoi_split = eoi_split,
+		.kvm_max_routes = kvm_check_cap(KVM_CAP_IRQ_ROUTING),
+		.kvm_supports_irqfd = kvm_check_cap(KVM_CAP_IRQFD),
 	};
 
 	print_args(&args);
diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h
index ce6f0383c1a1..4442081221a0 100644
--- a/tools/testing/selftests/kvm/include/aarch64/vgic.h
+++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h
@@ -29,4 +29,6 @@ int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level);
 void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, uint32_t vcpu);
 void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, uint32_t vcpu);
 
+#define KVM_IRQCHIP_NUM_PINS	(1020 - 32)
+
 #endif // SELFTEST_KVM_VGIC_H
-- 
cgit v1.2.3


From bebd8f3f869361e0249efe423ba76a0d991ce3e6 Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:05 -0800
Subject: KVM: selftests: aarch64: Add ISPENDR write tests in vgic_irq

Add injection tests that use writing into the ISPENDR register (to mark
IRQs as pending). This is typically used by migration code.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-17-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index 121113f24ed3..ab39f0bf18e7 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -65,6 +65,7 @@ typedef enum {
 	KVM_SET_IRQ_LINE_HIGH,
 	KVM_SET_LEVEL_INFO_HIGH,
 	KVM_INJECT_IRQFD,
+	KVM_WRITE_ISPENDR,
 } kvm_inject_cmd;
 
 struct kvm_inject_args {
@@ -105,6 +106,7 @@ static struct kvm_inject_desc inject_edge_fns[] = {
 	/*                                      sgi    ppi    spi */
 	{ KVM_INJECT_EDGE_IRQ_LINE,		false, false, true },
 	{ KVM_INJECT_IRQFD,			false, false, true },
+	{ KVM_WRITE_ISPENDR,			true,  false, true },
 	{ 0, },
 };
 
@@ -113,6 +115,7 @@ static struct kvm_inject_desc inject_level_fns[] = {
 	{ KVM_SET_IRQ_LINE_HIGH,		false, true,  true },
 	{ KVM_SET_LEVEL_INFO_HIGH,		false, true,  true },
 	{ KVM_INJECT_IRQFD,			false, false, true },
+	{ KVM_WRITE_ISPENDR,			false, true,  true },
 	{ 0, },
 };
 
@@ -495,6 +498,20 @@ static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm,
 	}
 }
 
+static void kvm_irq_write_ispendr_check(int gic_fd, uint32_t intid,
+			uint32_t vcpu, bool expect_failure)
+{
+	/*
+	 * Ignore this when expecting failure as invalid intids will lead to
+	 * either trying to inject SGIs when we configured the test to be
+	 * level_sensitive (or the reverse), or inject large intids which
+	 * will lead to writing above the ISPENDR register space (and we
+	 * don't want to do that either).
+	 */
+	if (!expect_failure)
+		kvm_irq_write_ispendr(gic_fd, intid, vcpu);
+}
+
 static void kvm_routing_and_irqfd_check(struct kvm_vm *vm,
 		uint32_t intid, uint32_t num, uint32_t kvm_max_routes,
 		bool expect_failure)
@@ -597,6 +614,11 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 					test_args->kvm_max_routes,
 					expect_failure);
 		break;
+	case KVM_WRITE_ISPENDR:
+		for (i = intid; i < intid + num; i++)
+			kvm_irq_write_ispendr_check(gic_fd, i,
+					VCPU_ID, expect_failure);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 728fcc46d2c2292d1ac73f3491b8f4332066fdad Mon Sep 17 00:00:00 2001
From: Ricardo Koller <ricarkol@google.com>
Date: Mon, 8 Nov 2021 18:39:06 -0800
Subject: KVM: selftests: aarch64: Add test for restoring active IRQs

Add a test that restores multiple IRQs in active state, it does it by
writing into ISACTIVER from the guest and using KVM ioctls. This test
tries to emulate what would happen during a live migration: restore
active IRQs.

Signed-off-by: Ricardo Koller <ricarkol@google.com>
Acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211109023906.1091208-18-ricarkol@google.com
---
 tools/testing/selftests/kvm/aarch64/vgic_irq.c | 91 ++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index ab39f0bf18e7..e6c7d7f8fbd1 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -66,6 +66,7 @@ typedef enum {
 	KVM_SET_LEVEL_INFO_HIGH,
 	KVM_INJECT_IRQFD,
 	KVM_WRITE_ISPENDR,
+	KVM_WRITE_ISACTIVER,
 } kvm_inject_cmd;
 
 struct kvm_inject_args {
@@ -96,6 +97,9 @@ static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
 #define KVM_INJECT(cmd, intid)							\
 	_KVM_INJECT_MULTI(cmd, intid, 1, false)
 
+#define KVM_ACTIVATE(cmd, intid)						\
+	kvm_inject_call(cmd, intid, 1, 1, false);
+
 struct kvm_inject_desc {
 	kvm_inject_cmd cmd;
 	/* can inject PPIs, PPIs, and/or SPIs. */
@@ -119,6 +123,12 @@ static struct kvm_inject_desc inject_level_fns[] = {
 	{ 0, },
 };
 
+static struct kvm_inject_desc set_active_fns[] = {
+	/*                                      sgi    ppi    spi */
+	{ KVM_WRITE_ISACTIVER,			true,  true,  true },
+	{ 0, },
+};
+
 #define for_each_inject_fn(t, f)						\
 	for ((f) = (t); (f)->cmd; (f)++)
 
@@ -126,6 +136,9 @@ static struct kvm_inject_desc inject_level_fns[] = {
 	for_each_inject_fn(t, f)						\
 		if ((args)->kvm_supports_irqfd || (f)->cmd != KVM_INJECT_IRQFD)
 
+#define for_each_supported_activate_fn(args, t, f)				\
+	for_each_supported_inject_fn((args), (t), (f))
+
 /* Shared between the guest main thread and the IRQ handlers. */
 volatile uint64_t irq_handled;
 volatile uint32_t irqnr_received[MAX_SPI + 1];
@@ -147,6 +160,12 @@ static uint64_t gic_read_ap1r0(void)
 	return reg;
 }
 
+static void gic_write_ap1r0(uint64_t val)
+{
+	write_sysreg_s(val, SYS_ICV_AP1R0_EL1);
+	isb();
+}
+
 static void guest_set_irq_line(uint32_t intid, uint32_t level);
 
 static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive)
@@ -274,6 +293,55 @@ static void guest_inject(struct test_args *args,
 	reset_priorities(args);
 }
 
+/*
+ * Restore the active state of multiple concurrent IRQs (given by
+ * concurrent_irqs).  This does what a live-migration would do on the
+ * destination side assuming there are some active IRQs that were not
+ * deactivated yet.
+ */
+static void guest_restore_active(struct test_args *args,
+		uint32_t first_intid, uint32_t num,
+		kvm_inject_cmd cmd)
+{
+	uint32_t prio, intid, ap1r;
+	int i;
+
+	/* Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs
+	 * in descending order, so intid+1 can preempt intid.
+	 */
+	for (i = 0, prio = (num - 1) * 8; i < num; i++, prio -= 8) {
+		GUEST_ASSERT(prio >= 0);
+		intid = i + first_intid;
+		gic_set_priority(intid, prio);
+	}
+
+	/* In a real migration, KVM would restore all GIC state before running
+	 * guest code.
+	 */
+	for (i = 0; i < num; i++) {
+		intid = i + first_intid;
+		KVM_ACTIVATE(cmd, intid);
+		ap1r = gic_read_ap1r0();
+		ap1r |= 1U << i;
+		gic_write_ap1r0(ap1r);
+	}
+
+	/* This is where the "migration" would occur. */
+
+	/* finish handling the IRQs starting with the highest priority one. */
+	for (i = 0; i < num; i++) {
+		intid = num - i - 1 + first_intid;
+		gic_set_eoi(intid);
+		if (args->eoi_split)
+			gic_set_dir(intid);
+	}
+
+	for (i = 0; i < num; i++)
+		GUEST_ASSERT(!gic_irq_get_active(i + first_intid));
+	GUEST_ASSERT_EQ(gic_read_ap1r0(), 0);
+	GUEST_ASSERT_IAR_EMPTY();
+}
+
 /*
  * Polls the IAR until it's not a spurious interrupt.
  *
@@ -391,6 +459,19 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f)
 		test_inject_preemption(args, MIN_SPI, 4, f->cmd);
 }
 
+static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f)
+{
+	/* Test up to 4 active IRQs. Same reason as in test_preemption. */
+	if (f->sgi)
+		guest_restore_active(args, MIN_SGI, 4, f->cmd);
+
+	if (f->ppi)
+		guest_restore_active(args, MIN_PPI, 4, f->cmd);
+
+	if (f->spi)
+		guest_restore_active(args, MIN_SPI, 4, f->cmd);
+}
+
 static void guest_code(struct test_args args)
 {
 	uint32_t i, nr_irqs = args.nr_irqs;
@@ -422,6 +503,12 @@ static void guest_code(struct test_args args)
 		test_injection_failure(&args, f);
 	}
 
+	/* Restore the active state of IRQs. This would happen when live
+	 * migrating IRQs in the middle of being handled.
+	 */
+	for_each_supported_activate_fn(&args, set_active_fns, f)
+		test_restore_active(&args, f);
+
 	GUEST_DONE();
 }
 
@@ -619,6 +706,10 @@ static void run_guest_cmd(struct kvm_vm *vm, int gic_fd,
 			kvm_irq_write_ispendr_check(gic_fd, i,
 					VCPU_ID, expect_failure);
 		break;
+	case KVM_WRITE_ISACTIVER:
+		for (i = intid; i < intid + num; i++)
+			kvm_irq_write_isactiver(gic_fd, i, VCPU_ID);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 9f3c16a430e8ac6b8211da106f4e4841d896ec99 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Tue, 14 Dec 2021 01:10:27 +0000
Subject: perf expr: Fix return value of ids__new()

callers of ids__new() function only do NULL checking for the return
value. ids__new() calles hashmap__new(), which may return
ERR_PTR(-ENOMEM).

Instead of changing the checking one-by-one return NULL instead of
ERR_PTR(-ENOMEM) to keep it consistent.

Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Reviewed-by: German Gomez <german.gomez@arm.com>
Tested-by: German Gomez <german.gomez@arm.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20211214011030.20200-1-linmq006@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/expr.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 254601060b39..666b59baeb70 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -66,7 +66,12 @@ static bool key_equal(const void *key1, const void *key2,
 
 struct hashmap *ids__new(void)
 {
-	return hashmap__new(key_hash, key_equal, NULL);
+	struct hashmap *hash;
+
+	hash = hashmap__new(key_hash, key_equal, NULL);
+	if (IS_ERR(hash))
+		return NULL;
+	return hash;
 }
 
 void ids__free(struct hashmap *ids)
-- 
cgit v1.2.3


From a78abde220243d6f44a265fe36c49957f6fa9851 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Wed, 15 Dec 2021 10:06:34 +0200
Subject: perf intel-pt: Fix parsing of VM time correlation arguments

Parser did not take ':' into account.

Example:

 Before:

  $ perf record -e intel_pt//u uname
  Linux
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.026 MB perf.data ]
  $ perf inject -i perf.data --vm-time-correlation="dry-run 123"
  $ perf inject -i perf.data --vm-time-correlation="dry-run 123:456"
  Failed to parse VM Time Correlation options
  0x620 [0x98]: failed to process type: 70 [Invalid argument]
  $

 After:

  $ perf inject -i perf.data --vm-time-correlation="dry-run 123:456"
  $

Fixes: e3ff42bdebcfeb5f ("perf intel-pt: Parse VM Time Correlation options and set up decoding")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211215080636.149562-2-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/intel-pt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 10c3187e4c5a..e8613cbda331 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -3625,6 +3625,7 @@ static int intel_pt_parse_vm_tm_corr_arg(struct intel_pt *pt, char **args)
 		*args = p;
 		return 0;
 	}
+	p += 1;
 	while (1) {
 		vmcs = strtoull(p, &p, 0);
 		if (errno)
-- 
cgit v1.2.3


From 5e0c325cdb714409a5b242c9e73a1b61157abb36 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Wed, 15 Dec 2021 10:06:35 +0200
Subject: perf script: Fix CPU filtering of a script's switch events

CPU filtering was not being applied to a script's switch events.

Fixes: 5bf83c29a0ad2e78 ("perf script: Add scripting operation process_switch()")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211215080636.149562-3-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9434367af166..c82b033e8942 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2473,7 +2473,7 @@ static int process_switch_event(struct perf_tool *tool,
 	if (perf_event__process_switch(tool, event, sample, machine) < 0)
 		return -1;
 
-	if (scripting_ops && scripting_ops->process_switch)
+	if (scripting_ops && scripting_ops->process_switch && !filter_cpu(sample))
 		scripting_ops->process_switch(event, sample, machine);
 
 	if (!script->show_switch_events)
-- 
cgit v1.2.3


From 0f80bfbf4919e32f52fe1312c3900ff4fbb7eeb9 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Wed, 15 Dec 2021 10:06:36 +0200
Subject: perf scripts python: intel-pt-events.py: Fix printing of switch
 events

The intel-pt-events.py script displays only the last of consecutive switch
statements but that may not be the last switch event for the CPU. Fix by
keeping a dictionary of last context switch keyed by CPU, and make it
possible to see all switch events by adding option --all-switch-events.

Fixes: a92bf335fd82eeee ("perf scripts python: intel-pt-events.py: Add branches to script")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211215080636.149562-4-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/python/intel-pt-events.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/scripts/python/intel-pt-events.py b/tools/perf/scripts/python/intel-pt-events.py
index 1d3a189a9a54..66452a8ec358 100644
--- a/tools/perf/scripts/python/intel-pt-events.py
+++ b/tools/perf/scripts/python/intel-pt-events.py
@@ -32,8 +32,7 @@ try:
 except:
 	broken_pipe_exception = IOError
 
-glb_switch_str		= None
-glb_switch_printed	= True
+glb_switch_str		= {}
 glb_insn		= False
 glb_disassembler	= None
 glb_src			= False
@@ -70,6 +69,7 @@ def trace_begin():
 	ap = argparse.ArgumentParser(usage = "", add_help = False)
 	ap.add_argument("--insn-trace", action='store_true')
 	ap.add_argument("--src-trace", action='store_true')
+	ap.add_argument("--all-switch-events", action='store_true')
 	global glb_args
 	global glb_insn
 	global glb_src
@@ -256,10 +256,6 @@ def print_srccode(comm, param_dict, sample, symbol, dso, with_insn):
 	print(start_str, src_str)
 
 def do_process_event(param_dict):
-	global glb_switch_printed
-	if not glb_switch_printed:
-		print(glb_switch_str)
-		glb_switch_printed = True
 	event_attr = param_dict["attr"]
 	sample	   = param_dict["sample"]
 	raw_buf	   = param_dict["raw_buf"]
@@ -274,6 +270,11 @@ def do_process_event(param_dict):
 	dso    = get_optional(param_dict, "dso")
 	symbol = get_optional(param_dict, "symbol")
 
+	cpu = sample["cpu"]
+	if cpu in glb_switch_str:
+		print(glb_switch_str[cpu])
+		del glb_switch_str[cpu]
+
 	if name[0:12] == "instructions":
 		if glb_src:
 			print_srccode(comm, param_dict, sample, symbol, dso, True)
@@ -336,8 +337,6 @@ def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x):
 		sys.exit(1)
 
 def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_preempt, *x):
-	global glb_switch_printed
-	global glb_switch_str
 	if out:
 		out_str = "Switch out "
 	else:
@@ -350,6 +349,10 @@ def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_pree
 		machine_str = ""
 	else:
 		machine_str = "machine PID %d" % machine_pid
-	glb_switch_str = "%16s %5d/%-5d [%03u] %9u.%09u %5d/%-5d %s %s" % \
+	switch_str = "%16s %5d/%-5d [%03u] %9u.%09u %5d/%-5d %s %s" % \
 		(out_str, pid, tid, cpu, ts / 1000000000, ts %1000000000, np_pid, np_tid, machine_str, preempt_str)
-	glb_switch_printed = False
+	if glb_args.all_switch_events:
+		print(switch_str);
+	else:
+		global glb_switch_str
+		glb_switch_str[cpu] = switch_str
-- 
cgit v1.2.3


From 3cc31d794097a0de5ac619d4a20b1975139e6b05 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 22 Dec 2021 13:39:23 -0800
Subject: libbpf: Normalize PT_REGS_xxx() macro definitions

Refactor PT_REGS macros definitions in  bpf_tracing.h to avoid excessive
duplication. We currently have classic PT_REGS_xxx() and CO-RE-enabled
PT_REGS_xxx_CORE(). We are about to add also _SYSCALL variants, which
would require excessive copying of all the per-architecture definitions.

Instead, separate architecture-specific field/register names from the
final macro that utilize them. That way for upcoming _SYSCALL variants
we'll be able to just define x86_64 exception and otherwise have one
common set of _SYSCALL macro definitions common for all architectures.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/bpf/20211222213924.1869758-1-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 377 ++++++++++++++++++--------------------------
 1 file changed, 152 insertions(+), 225 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index db05a5937105..20fe06d0acd9 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -66,277 +66,204 @@
 
 #if defined(__KERNEL__) || defined(__VMLINUX_H__)
 
-#define PT_REGS_PARM1(x) ((x)->di)
-#define PT_REGS_PARM2(x) ((x)->si)
-#define PT_REGS_PARM3(x) ((x)->dx)
-#define PT_REGS_PARM4(x) ((x)->cx)
-#define PT_REGS_PARM5(x) ((x)->r8)
-#define PT_REGS_RET(x) ((x)->sp)
-#define PT_REGS_FP(x) ((x)->bp)
-#define PT_REGS_RC(x) ((x)->ax)
-#define PT_REGS_SP(x) ((x)->sp)
-#define PT_REGS_IP(x) ((x)->ip)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), di)
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), si)
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), dx)
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), cx)
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), r8)
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), sp)
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), bp)
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), ax)
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), sp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), ip)
+#define __PT_PARM1_REG di
+#define __PT_PARM2_REG si
+#define __PT_PARM3_REG dx
+#define __PT_PARM4_REG cx
+#define __PT_PARM5_REG r8
+#define __PT_RET_REG sp
+#define __PT_FP_REG bp
+#define __PT_RC_REG ax
+#define __PT_SP_REG sp
+#define __PT_IP_REG ip
 
 #else
 
 #ifdef __i386__
-/* i386 kernel is built with -mregparm=3 */
-#define PT_REGS_PARM1(x) ((x)->eax)
-#define PT_REGS_PARM2(x) ((x)->edx)
-#define PT_REGS_PARM3(x) ((x)->ecx)
-#define PT_REGS_PARM4(x) 0
-#define PT_REGS_PARM5(x) 0
-#define PT_REGS_RET(x) ((x)->esp)
-#define PT_REGS_FP(x) ((x)->ebp)
-#define PT_REGS_RC(x) ((x)->eax)
-#define PT_REGS_SP(x) ((x)->esp)
-#define PT_REGS_IP(x) ((x)->eip)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), eax)
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), edx)
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), ecx)
-#define PT_REGS_PARM4_CORE(x) 0
-#define PT_REGS_PARM5_CORE(x) 0
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), esp)
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), ebp)
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), eax)
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), esp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), eip)
-
-#else
 
-#define PT_REGS_PARM1(x) ((x)->rdi)
-#define PT_REGS_PARM2(x) ((x)->rsi)
-#define PT_REGS_PARM3(x) ((x)->rdx)
-#define PT_REGS_PARM4(x) ((x)->rcx)
-#define PT_REGS_PARM5(x) ((x)->r8)
-#define PT_REGS_RET(x) ((x)->rsp)
-#define PT_REGS_FP(x) ((x)->rbp)
-#define PT_REGS_RC(x) ((x)->rax)
-#define PT_REGS_SP(x) ((x)->rsp)
-#define PT_REGS_IP(x) ((x)->rip)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), rdi)
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), rsi)
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), rdx)
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), rcx)
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), r8)
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), rsp)
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), rbp)
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), rax)
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), rsp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), rip)
-
-#endif
-#endif
+#define __PT_PARM1_REG eax
+#define __PT_PARM2_REG edx
+#define __PT_PARM3_REG ecx
+/* i386 kernel is built with -mregparm=3 */
+#define __PT_PARM4_REG __unsupported__
+#define __PT_PARM5_REG __unsupported__
+#define __PT_RET_REG esp
+#define __PT_FP_REG ebp
+#define __PT_RC_REG eax
+#define __PT_SP_REG esp
+#define __PT_IP_REG eip
+
+#else /* __i386__ */
+
+#define __PT_PARM1_REG rdi
+#define __PT_PARM2_REG rsi
+#define __PT_PARM3_REG rdx
+#define __PT_PARM4_REG rcx
+#define __PT_PARM5_REG r8
+#define __PT_RET_REG rsp
+#define __PT_FP_REG rbp
+#define __PT_RC_REG rax
+#define __PT_SP_REG rsp
+#define __PT_IP_REG rip
+
+#endif /* __i386__ */
+
+#endif /* __KERNEL__ || __VMLINUX_H__ */
 
 #elif defined(bpf_target_s390)
 
 /* s390 provides user_pt_regs instead of struct pt_regs to userspace */
-struct pt_regs;
-#define PT_REGS_S390 const volatile user_pt_regs
-#define PT_REGS_PARM1(x) (((PT_REGS_S390 *)(x))->gprs[2])
-#define PT_REGS_PARM2(x) (((PT_REGS_S390 *)(x))->gprs[3])
-#define PT_REGS_PARM3(x) (((PT_REGS_S390 *)(x))->gprs[4])
-#define PT_REGS_PARM4(x) (((PT_REGS_S390 *)(x))->gprs[5])
-#define PT_REGS_PARM5(x) (((PT_REGS_S390 *)(x))->gprs[6])
-#define PT_REGS_RET(x) (((PT_REGS_S390 *)(x))->gprs[14])
-/* Works only with CONFIG_FRAME_POINTER */
-#define PT_REGS_FP(x) (((PT_REGS_S390 *)(x))->gprs[11])
-#define PT_REGS_RC(x) (((PT_REGS_S390 *)(x))->gprs[2])
-#define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15])
-#define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[2])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[3])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[4])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[5])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[6])
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[14])
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[11])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[2])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[15])
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), psw.addr)
+#define __PT_REGS_CAST(x) ((const user_pt_regs *)(x))
+#define __PT_PARM1_REG gprs[2]
+#define __PT_PARM2_REG gprs[3]
+#define __PT_PARM3_REG gprs[4]
+#define __PT_PARM4_REG gprs[5]
+#define __PT_PARM5_REG gprs[6]
+#define __PT_RET_REG grps[14]
+#define __PT_FP_REG gprs[11]	/* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG gprs[2]
+#define __PT_SP_REG gprs[15]
+#define __PT_IP_REG psw.addr
 
 #elif defined(bpf_target_arm)
 
-#define PT_REGS_PARM1(x) ((x)->uregs[0])
-#define PT_REGS_PARM2(x) ((x)->uregs[1])
-#define PT_REGS_PARM3(x) ((x)->uregs[2])
-#define PT_REGS_PARM4(x) ((x)->uregs[3])
-#define PT_REGS_PARM5(x) ((x)->uregs[4])
-#define PT_REGS_RET(x) ((x)->uregs[14])
-#define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */
-#define PT_REGS_RC(x) ((x)->uregs[0])
-#define PT_REGS_SP(x) ((x)->uregs[13])
-#define PT_REGS_IP(x) ((x)->uregs[12])
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), uregs[0])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), uregs[1])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), uregs[2])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), uregs[3])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), uregs[4])
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), uregs[14])
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), uregs[11])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), uregs[0])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), uregs[13])
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), uregs[12])
+#define __PT_PARM1_REG uregs[0]
+#define __PT_PARM2_REG uregs[1]
+#define __PT_PARM3_REG uregs[2]
+#define __PT_PARM4_REG uregs[3]
+#define __PT_PARM5_REG uregs[4]
+#define __PT_RET_REG uregs[14]
+#define __PT_FP_REG uregs[11]	/* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG uregs[0]
+#define __PT_SP_REG uregs[13]
+#define __PT_IP_REG uregs[12]
 
 #elif defined(bpf_target_arm64)
 
 /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */
-struct pt_regs;
-#define PT_REGS_ARM64 const volatile struct user_pt_regs
-#define PT_REGS_PARM1(x) (((PT_REGS_ARM64 *)(x))->regs[0])
-#define PT_REGS_PARM2(x) (((PT_REGS_ARM64 *)(x))->regs[1])
-#define PT_REGS_PARM3(x) (((PT_REGS_ARM64 *)(x))->regs[2])
-#define PT_REGS_PARM4(x) (((PT_REGS_ARM64 *)(x))->regs[3])
-#define PT_REGS_PARM5(x) (((PT_REGS_ARM64 *)(x))->regs[4])
-#define PT_REGS_RET(x) (((PT_REGS_ARM64 *)(x))->regs[30])
-/* Works only with CONFIG_FRAME_POINTER */
-#define PT_REGS_FP(x) (((PT_REGS_ARM64 *)(x))->regs[29])
-#define PT_REGS_RC(x) (((PT_REGS_ARM64 *)(x))->regs[0])
-#define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp)
-#define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[0])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[1])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[2])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[3])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[4])
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[30])
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[29])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), regs[0])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), sp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_ARM64 *)(x), pc)
+#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x))
+#define __PT_PARM1_REG regs[0]
+#define __PT_PARM2_REG regs[1]
+#define __PT_PARM3_REG regs[2]
+#define __PT_PARM4_REG regs[3]
+#define __PT_PARM5_REG regs[4]
+#define __PT_RET_REG regs[30]
+#define __PT_FP_REG regs[29]	/* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG regs[0]
+#define __PT_SP_REG sp
+#define __PT_IP_REG pc
 
 #elif defined(bpf_target_mips)
 
-#define PT_REGS_PARM1(x) ((x)->regs[4])
-#define PT_REGS_PARM2(x) ((x)->regs[5])
-#define PT_REGS_PARM3(x) ((x)->regs[6])
-#define PT_REGS_PARM4(x) ((x)->regs[7])
-#define PT_REGS_PARM5(x) ((x)->regs[8])
-#define PT_REGS_RET(x) ((x)->regs[31])
-#define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */
-#define PT_REGS_RC(x) ((x)->regs[2])
-#define PT_REGS_SP(x) ((x)->regs[29])
-#define PT_REGS_IP(x) ((x)->cp0_epc)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), regs[4])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), regs[5])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), regs[6])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), regs[7])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), regs[8])
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), regs[31])
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), regs[30])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), regs[2])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), regs[29])
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), cp0_epc)
+#define __PT_PARM1_REG regs[4]
+#define __PT_PARM2_REG regs[5]
+#define __PT_PARM3_REG regs[6]
+#define __PT_PARM4_REG regs[7]
+#define __PT_PARM5_REG regs[8]
+#define __PT_RET_REG regs[31]
+#define __PT_FP_REG regs[30]	/* Works only with CONFIG_FRAME_POINTER */
+#define __PT_RC_REG regs[2]
+#define __PT_SP_REG regs[29]
+#define __PT_IP_REG cp0_epc
 
 #elif defined(bpf_target_powerpc)
 
-#define PT_REGS_PARM1(x) ((x)->gpr[3])
-#define PT_REGS_PARM2(x) ((x)->gpr[4])
-#define PT_REGS_PARM3(x) ((x)->gpr[5])
-#define PT_REGS_PARM4(x) ((x)->gpr[6])
-#define PT_REGS_PARM5(x) ((x)->gpr[7])
-#define PT_REGS_RC(x) ((x)->gpr[3])
-#define PT_REGS_SP(x) ((x)->sp)
-#define PT_REGS_IP(x) ((x)->nip)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), gpr[3])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), gpr[4])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), gpr[5])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), gpr[6])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), gpr[7])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), gpr[3])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), sp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), nip)
+#define __PT_PARM1_REG gpr[3]
+#define __PT_PARM2_REG gpr[4]
+#define __PT_PARM3_REG gpr[5]
+#define __PT_PARM4_REG gpr[6]
+#define __PT_PARM5_REG gpr[7]
+#define __PT_RET_REG regs[31]
+#define __PT_FP_REG __unsupported__
+#define __PT_RC_REG gpr[3]
+#define __PT_SP_REG sp
+#define __PT_IP_REG nip
 
 #elif defined(bpf_target_sparc)
 
-#define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0])
-#define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1])
-#define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2])
-#define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3])
-#define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4])
-#define PT_REGS_RET(x) ((x)->u_regs[UREG_I7])
-#define PT_REGS_RC(x) ((x)->u_regs[UREG_I0])
-#define PT_REGS_SP(x) ((x)->u_regs[UREG_FP])
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I0])
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I1])
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I2])
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I3])
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I4])
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I7])
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), u_regs[UREG_I0])
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), u_regs[UREG_FP])
-
+#define __PT_PARM1_REG u_regs[UREG_I0]
+#define __PT_PARM2_REG u_regs[UREG_I1]
+#define __PT_PARM3_REG u_regs[UREG_I2]
+#define __PT_PARM4_REG u_regs[UREG_I3]
+#define __PT_PARM5_REG u_regs[UREG_I4]
+#define __PT_RET_REG u_regs[UREG_I7]
+#define __PT_FP_REG __unsupported__
+#define __PT_RC_REG u_regs[UREG_I0]
+#define __PT_SP_REG u_regs[UREG_FP]
 /* Should this also be a bpf_target check for the sparc case? */
 #if defined(__arch64__)
-#define PT_REGS_IP(x) ((x)->tpc)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), tpc)
+#define __PT_IP_REG tpc
 #else
-#define PT_REGS_IP(x) ((x)->pc)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), pc)
+#define __PT_IP_REG pc
 #endif
 
 #elif defined(bpf_target_riscv)
 
+#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
+#define __PT_PARM1_REG a0
+#define __PT_PARM2_REG a1
+#define __PT_PARM3_REG a2
+#define __PT_PARM4_REG a3
+#define __PT_PARM5_REG a4
+#define __PT_RET_REG ra
+#define __PT_FP_REG fp
+#define __PT_RC_REG a5
+#define __PT_SP_REG sp
+#define __PT_IP_REG epc
+
+#endif
+
+#if defined(bpf_target_defined)
+
 struct pt_regs;
-#define PT_REGS_RV const volatile struct user_regs_struct
-#define PT_REGS_PARM1(x) (((PT_REGS_RV *)(x))->a0)
-#define PT_REGS_PARM2(x) (((PT_REGS_RV *)(x))->a1)
-#define PT_REGS_PARM3(x) (((PT_REGS_RV *)(x))->a2)
-#define PT_REGS_PARM4(x) (((PT_REGS_RV *)(x))->a3)
-#define PT_REGS_PARM5(x) (((PT_REGS_RV *)(x))->a4)
-#define PT_REGS_RET(x) (((PT_REGS_RV *)(x))->ra)
-#define PT_REGS_FP(x) (((PT_REGS_RV *)(x))->s5)
-#define PT_REGS_RC(x) (((PT_REGS_RV *)(x))->a5)
-#define PT_REGS_SP(x) (((PT_REGS_RV *)(x))->sp)
-#define PT_REGS_IP(x) (((PT_REGS_RV *)(x))->epc)
-
-#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a0)
-#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a1)
-#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a2)
-#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a3)
-#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a4)
-#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), ra)
-#define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), fp)
-#define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), a5)
-#define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), sp)
-#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_RV *)(x), epc)
 
+/* allow some architecutres to override `struct pt_regs` */
+#ifndef __PT_REGS_CAST
+#define __PT_REGS_CAST(x) (x)
 #endif
 
+#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
+#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
+#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG)
+#define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG)
+#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG)
+#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG)
+#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG)
+#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG)
+#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG)
+#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG)
+
+#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_REG)
+#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_REG)
+#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG)
+#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG)
+#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG)
+#define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG)
+#define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG)
+#define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG)
+#define PT_REGS_SP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_SP_REG)
+#define PT_REGS_IP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_IP_REG)
+
 #if defined(bpf_target_powerpc)
+
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)		({ (ip) = (ctx)->link; })
 #define BPF_KRETPROBE_READ_RET_IP		BPF_KPROBE_READ_RET_IP
+
 #elif defined(bpf_target_sparc)
+
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)		({ (ip) = PT_REGS_RET(ctx); })
 #define BPF_KRETPROBE_READ_RET_IP		BPF_KPROBE_READ_RET_IP
-#elif defined(bpf_target_defined)
+
+#else
+
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)					    \
 	({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); })
 #define BPF_KRETPROBE_READ_RET_IP(ip, ctx)				    \
-	({ bpf_probe_read_kernel(&(ip), sizeof(ip),			    \
-			  (void *)(PT_REGS_FP(ctx) + sizeof(ip))); })
+	({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); })
+
 #endif
 
-#if !defined(bpf_target_defined)
+#else /* defined(bpf_target_defined) */
 
 #define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM2(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
@@ -363,7 +290,7 @@ struct pt_regs;
 #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 
-#endif /* !defined(bpf_target_defined) */
+#endif /* defined(bpf_target_defined) */
 
 #ifndef ___bpf_concat
 #define ___bpf_concat(a, b) a ## b
-- 
cgit v1.2.3


From f60edf5b53848f2cf53e7e4b716ed8e45563bb12 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 22 Dec 2021 13:39:24 -0800
Subject: libbpf: Use 100-character limit to make bpf_tracing.h easier to read

Improve bpf_tracing.h's macro definition readability by keeping them
single-line and better aligned. This makes it easier to follow all those
variadic patterns.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211222213924.1869758-2-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 54 ++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 20fe06d0acd9..90f56b0f585f 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -302,25 +302,23 @@ struct pt_regs;
 #define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
 #endif
 #ifndef ___bpf_narg
-#define ___bpf_narg(...) \
-	___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define ___bpf_narg(...) ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
 #endif
 
-#define ___bpf_ctx_cast0() ctx
-#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0]
-#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1]
-#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2]
-#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3]
-#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4]
-#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5]
-#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6]
-#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7]
-#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8]
+#define ___bpf_ctx_cast0()            ctx
+#define ___bpf_ctx_cast1(x)           ___bpf_ctx_cast0(), (void *)ctx[0]
+#define ___bpf_ctx_cast2(x, args...)  ___bpf_ctx_cast1(args), (void *)ctx[1]
+#define ___bpf_ctx_cast3(x, args...)  ___bpf_ctx_cast2(args), (void *)ctx[2]
+#define ___bpf_ctx_cast4(x, args...)  ___bpf_ctx_cast3(args), (void *)ctx[3]
+#define ___bpf_ctx_cast5(x, args...)  ___bpf_ctx_cast4(args), (void *)ctx[4]
+#define ___bpf_ctx_cast6(x, args...)  ___bpf_ctx_cast5(args), (void *)ctx[5]
+#define ___bpf_ctx_cast7(x, args...)  ___bpf_ctx_cast6(args), (void *)ctx[6]
+#define ___bpf_ctx_cast8(x, args...)  ___bpf_ctx_cast7(args), (void *)ctx[7]
+#define ___bpf_ctx_cast9(x, args...)  ___bpf_ctx_cast8(args), (void *)ctx[8]
 #define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
 #define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
 #define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
-#define ___bpf_ctx_cast(args...) \
-	___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
+#define ___bpf_ctx_cast(args...)      ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
 
 /*
  * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
@@ -353,19 +351,13 @@ ____##name(unsigned long long *ctx, ##args)
 
 struct pt_regs;
 
-#define ___bpf_kprobe_args0() ctx
-#define ___bpf_kprobe_args1(x) \
-	___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx)
-#define ___bpf_kprobe_args2(x, args...) \
-	___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx)
-#define ___bpf_kprobe_args3(x, args...) \
-	___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx)
-#define ___bpf_kprobe_args4(x, args...) \
-	___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx)
-#define ___bpf_kprobe_args5(x, args...) \
-	___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx)
-#define ___bpf_kprobe_args(args...) \
-	___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args)
+#define ___bpf_kprobe_args0()           ctx
+#define ___bpf_kprobe_args1(x)          ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx)
+#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx)
+#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx)
+#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx)
+#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx)
+#define ___bpf_kprobe_args(args...)     ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args)
 
 /*
  * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for
@@ -391,11 +383,9 @@ typeof(name(0)) name(struct pt_regs *ctx)				    \
 static __attribute__((always_inline)) typeof(name(0))			    \
 ____##name(struct pt_regs *ctx, ##args)
 
-#define ___bpf_kretprobe_args0() ctx
-#define ___bpf_kretprobe_args1(x) \
-	___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx)
-#define ___bpf_kretprobe_args(args...) \
-	___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args)
+#define ___bpf_kretprobe_args0()       ctx
+#define ___bpf_kretprobe_args1(x)      ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx)
+#define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args)
 
 /*
  * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional
-- 
cgit v1.2.3


From 5b3d72987701d51bf31823b39db49d10970f5c2d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 22 Dec 2021 15:10:03 -0800
Subject: libbpf: Improve LINUX_VERSION_CODE detection

Ubuntu reports incorrect kernel version through uname(), which on older
kernels leads to kprobe BPF programs failing to load due to the version
check mismatch.

Accommodate Ubuntu's quirks with LINUX_VERSION_CODE by using
Ubuntu-specific /proc/version_code to fetch major/minor/patch versions
to form LINUX_VERSION_CODE.

While at it, consolide libbpf's kernel version detection code between
libbpf.c and libbpf_probes.c.

  [0] Closes: https://github.com/libbpf/libbpf/issues/421

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20211222231003.2334940-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c          | 27 ++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf_internal.h |  2 ++
 tools/lib/bpf/libbpf_probes.c   | 16 ----------------
 3 files changed, 28 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index cf862a19222b..9cb99d1e2385 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -795,11 +795,36 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data,
 	return 0;
 }
 
-static __u32 get_kernel_version(void)
+__u32 get_kernel_version(void)
 {
+	/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release,
+	 * but Ubuntu provides /proc/version_signature file, as described at
+	 * https://ubuntu.com/kernel, with an example contents below, which we
+	 * can use to get a proper LINUX_VERSION_CODE.
+	 *
+	 *   Ubuntu 5.4.0-12.15-generic 5.4.8
+	 *
+	 * In the above, 5.4.8 is what kernel is actually expecting, while
+	 * uname() call will return 5.4.0 in info.release.
+	 */
+	const char *ubuntu_kver_file = "/proc/version_signature";
 	__u32 major, minor, patch;
 	struct utsname info;
 
+	if (access(ubuntu_kver_file, R_OK) == 0) {
+		FILE *f;
+
+		f = fopen(ubuntu_kver_file, "r");
+		if (f) {
+			if (fscanf(f, "%*s %*s %d.%d.%d\n", &major, &minor, &patch) == 3) {
+				fclose(f);
+				return KERNEL_VERSION(major, minor, patch);
+			}
+			fclose(f);
+		}
+		/* something went wrong, fall back to uname() approach */
+	}
+
 	uname(&info);
 	if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3)
 		return 0;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 5dbe4f463880..1565679eb432 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -188,6 +188,8 @@ static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz)
 	dst[i] = '\0';
 }
 
+__u32 get_kernel_version(void);
+
 struct btf;
 struct btf_type;
 
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 9d3985283dfc..97b06cede56f 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -48,22 +48,6 @@ static int get_vendor_id(int ifindex)
 	return strtol(buf, NULL, 0);
 }
 
-static int get_kernel_version(void)
-{
-	int version, subversion, patchlevel;
-	struct utsname utsn;
-
-	/* Return 0 on failure, and attempt to probe with empty kversion */
-	if (uname(&utsn))
-		return 0;
-
-	if (sscanf(utsn.release, "%d.%d.%d",
-		   &version, &subversion, &patchlevel) != 3)
-		return 0;
-
-	return (version << 16) + (subversion << 8) + patchlevel;
-}
-
 static int probe_prog_load(enum bpf_prog_type prog_type,
 			   const struct bpf_insn *insns, size_t insns_cnt,
 			   char *log_buf, size_t log_buf_sz,
-- 
cgit v1.2.3


From 9c1952aeaa98b3cfc49e2a79cb2c7d6a674213e9 Mon Sep 17 00:00:00 2001
From: wujianguo <wujianguo@chinatelecom.cn>
Date: Wed, 29 Dec 2021 18:58:10 +0800
Subject: selftests/net: udpgso_bench_tx: fix dst ip argument

udpgso_bench_tx call setup_sockaddr() for dest address before
parsing all arguments, if we specify "-p ${dst_port}" after "-D ${dst_ip}",
then ${dst_port} will be ignored, and using default cfg_port 8000.

This will cause test case "multiple GRO socks" failed in udpgro.sh.

Setup sockaddr after parsing all arguments.

Fixes: 3a687bef148d ("selftests: udp gso benchmark")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/ff620d9f-5b52-06ab-5286-44b945453002@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/udpgso_bench_tx.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
index 17512a43885e..f1fdaa270291 100644
--- a/tools/testing/selftests/net/udpgso_bench_tx.c
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -419,6 +419,7 @@ static void usage(const char *filepath)
 
 static void parse_opts(int argc, char **argv)
 {
+	const char *bind_addr = NULL;
 	int max_len, hdrlen;
 	int c;
 
@@ -446,7 +447,7 @@ static void parse_opts(int argc, char **argv)
 			cfg_cpu = strtol(optarg, NULL, 0);
 			break;
 		case 'D':
-			setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+			bind_addr = optarg;
 			break;
 		case 'l':
 			cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
@@ -492,6 +493,11 @@ static void parse_opts(int argc, char **argv)
 		}
 	}
 
+	if (!bind_addr)
+		bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0";
+
+	setup_sockaddr(cfg_family, bind_addr, &cfg_dst_addr);
+
 	if (optind != argc)
 		usage(argv[0]);
 
-- 
cgit v1.2.3


From add25d6d6c85f7b6d00a055ee0a4169acf845681 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Wed, 29 Dec 2021 15:27:30 +0800
Subject: selftests: net: Fix a typo in udpgro_fwd.sh

$rvs -> $rcv

Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Link: https://lore.kernel.org/r/d247d7c8-a03a-0abf-3c71-4006a051d133@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/udpgro_fwd.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index 7f26591f236b..6a3985b8cd7f 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -132,7 +132,7 @@ run_test() {
 	local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \
 							  sed -e 's/\[//' -e 's/:.*//'`
 	if [ $rcv != $pkts ]; then
-		echo " fail - received $rvs packets, expected $pkts"
+		echo " fail - received $rcv packets, expected $pkts"
 		ret=1
 		return
 	fi
-- 
cgit v1.2.3


From 0ae6eff2978ee118ce2b536090af0682db13bb83 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 24 Dec 2021 15:29:16 +0000
Subject: bpf/selftests: Update local storage selftest for sleepable programs

Remove the spin lock logic and update the selftests to use sleepable
programs to use a mix of sleepable and non-sleepable programs. It's more
useful to test the sleepable programs since the tests don't really need
spinlocks.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211224152916.1550677-3-kpsingh@kernel.org
---
 .../selftests/bpf/prog_tests/test_local_storage.c  | 20 ++++++------------
 tools/testing/selftests/bpf/progs/local_storage.c  | 24 +++++-----------------
 2 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
index d2c16eaae367..26ac26a88026 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
@@ -28,10 +28,6 @@ static unsigned int duration;
 struct storage {
 	void *inode;
 	unsigned int value;
-	/* Lock ensures that spin locked versions of local stoage operations
-	 * also work, most operations in this tests are still single threaded
-	 */
-	struct bpf_spin_lock lock;
 };
 
 /* Fork and exec the provided rm binary and return the exit code of the
@@ -66,27 +62,24 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path)
 
 static bool check_syscall_operations(int map_fd, int obj_fd)
 {
-	struct storage val = { .value = TEST_STORAGE_VALUE, .lock = { 0 } },
-		       lookup_val = { .value = 0, .lock = { 0 } };
+	struct storage val = { .value = TEST_STORAGE_VALUE },
+		       lookup_val = { .value = 0 };
 	int err;
 
 	/* Looking up an existing element should fail initially */
-	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val,
-					BPF_F_LOCK);
+	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, 0);
 	if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem",
 		  "err:%d errno:%d\n", err, errno))
 		return false;
 
 	/* Create a new element */
-	err = bpf_map_update_elem(map_fd, &obj_fd, &val,
-				  BPF_NOEXIST | BPF_F_LOCK);
+	err = bpf_map_update_elem(map_fd, &obj_fd, &val, BPF_NOEXIST);
 	if (CHECK(err < 0, "bpf_map_update_elem", "err:%d errno:%d\n", err,
 		  errno))
 		return false;
 
 	/* Lookup the newly created element */
-	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val,
-					BPF_F_LOCK);
+	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, 0);
 	if (CHECK(err < 0, "bpf_map_lookup_elem", "err:%d errno:%d", err,
 		  errno))
 		return false;
@@ -102,8 +95,7 @@ static bool check_syscall_operations(int map_fd, int obj_fd)
 		return false;
 
 	/* The lookup should fail, now that the element has been deleted */
-	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val,
-					BPF_F_LOCK);
+	err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, 0);
 	if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem",
 		  "err:%d errno:%d\n", err, errno))
 		return false;
diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c
index 95868bc7ada9..9b1f9b75d5c2 100644
--- a/tools/testing/selftests/bpf/progs/local_storage.c
+++ b/tools/testing/selftests/bpf/progs/local_storage.c
@@ -20,7 +20,6 @@ int sk_storage_result = -1;
 struct local_storage {
 	struct inode *exec_inode;
 	__u32 value;
-	struct bpf_spin_lock lock;
 };
 
 struct {
@@ -58,9 +57,7 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
 				       bpf_get_current_task_btf(), 0, 0);
 	if (storage) {
 		/* Don't let an executable delete itself */
-		bpf_spin_lock(&storage->lock);
 		is_self_unlink = storage->exec_inode == victim->d_inode;
-		bpf_spin_unlock(&storage->lock);
 		if (is_self_unlink)
 			return -EPERM;
 	}
@@ -68,7 +65,7 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
 	return 0;
 }
 
-SEC("lsm/inode_rename")
+SEC("lsm.s/inode_rename")
 int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
 	     struct inode *new_dir, struct dentry *new_dentry,
 	     unsigned int flags)
@@ -89,10 +86,8 @@ int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
 	if (!storage)
 		return 0;
 
-	bpf_spin_lock(&storage->lock);
 	if (storage->value != DUMMY_STORAGE_VALUE)
 		inode_storage_result = -1;
-	bpf_spin_unlock(&storage->lock);
 
 	err = bpf_inode_storage_delete(&inode_storage_map, old_dentry->d_inode);
 	if (!err)
@@ -101,7 +96,7 @@ int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 }
 
-SEC("lsm/socket_bind")
+SEC("lsm.s/socket_bind")
 int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 	     int addrlen)
 {
@@ -117,10 +112,8 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 	if (!storage)
 		return 0;
 
-	bpf_spin_lock(&storage->lock);
 	if (storage->value != DUMMY_STORAGE_VALUE)
 		sk_storage_result = -1;
-	bpf_spin_unlock(&storage->lock);
 
 	err = bpf_sk_storage_delete(&sk_storage_map, sock->sk);
 	if (!err)
@@ -129,7 +122,7 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 	return 0;
 }
 
-SEC("lsm/socket_post_create")
+SEC("lsm.s/socket_post_create")
 int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 	     int protocol, int kern)
 {
@@ -144,9 +137,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 	if (!storage)
 		return 0;
 
-	bpf_spin_lock(&storage->lock);
 	storage->value = DUMMY_STORAGE_VALUE;
-	bpf_spin_unlock(&storage->lock);
 
 	return 0;
 }
@@ -154,7 +145,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 /* This uses the local storage to remember the inode of the binary that a
  * process was originally executing.
  */
-SEC("lsm/bprm_committed_creds")
+SEC("lsm.s/bprm_committed_creds")
 void BPF_PROG(exec, struct linux_binprm *bprm)
 {
 	__u32 pid = bpf_get_current_pid_tgid() >> 32;
@@ -166,18 +157,13 @@ void BPF_PROG(exec, struct linux_binprm *bprm)
 	storage = bpf_task_storage_get(&task_storage_map,
 				       bpf_get_current_task_btf(), 0,
 				       BPF_LOCAL_STORAGE_GET_F_CREATE);
-	if (storage) {
-		bpf_spin_lock(&storage->lock);
+	if (storage)
 		storage->exec_inode = bprm->file->f_inode;
-		bpf_spin_unlock(&storage->lock);
-	}
 
 	storage = bpf_inode_storage_get(&inode_storage_map, bprm->file->f_inode,
 					0, BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (!storage)
 		return;
 
-	bpf_spin_lock(&storage->lock);
 	storage->value = DUMMY_STORAGE_VALUE;
-	bpf_spin_unlock(&storage->lock);
 }
-- 
cgit v1.2.3


From 8b3170e07539855ee91bc5e2fa7780a4c9b5c7aa Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Thu, 30 Dec 2021 18:40:29 +0800
Subject: selftests: net: using ping6 for IPv6 in udpgro_fwd.sh

udpgro_fwd.sh output following message:
  ping: 2001:db8:1::100: Address family for hostname not supported

Using ping6 when pinging IPv6 addresses.

Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgro_fwd.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index 6a3985b8cd7f..3ea73013d956 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -185,6 +185,7 @@ for family in 4 6; do
 	IPT=iptables
 	SUFFIX=24
 	VXDEV=vxlan
+	PING=ping
 
 	if [ $family = 6 ]; then
 		BM_NET=$BM_NET_V6
@@ -192,6 +193,7 @@ for family in 4 6; do
 		SUFFIX="64 nodad"
 		VXDEV=vxlan6
 		IPT=ip6tables
+		PING="ping6"
 	fi
 
 	echo "IPv$family"
@@ -237,7 +239,7 @@ for family in 4 6; do
 
 	# load arp cache before running the test to reduce the amount of
 	# stray traffic on top of the UDP tunnel
-	ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null
+	ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null
 	run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST
 	cleanup
 
-- 
cgit v1.2.3


From f5c73297181c6b3ad76537bad98eaad6d29b9333 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Thu, 30 Dec 2021 20:12:31 -0800
Subject: userfaultfd/selftests: fix hugetlb area allocations

Currently, userfaultfd selftest for hugetlb as run from run_vmtests.sh
or any environment where there are 'just enough' hugetlb pages will
always fail with:

  testing events (fork, remap, remove):
		ERROR: UFFDIO_COPY error: -12 (errno=12, line=616)

The ENOMEM error code implies there are not enough hugetlb pages.
However, there are free hugetlb pages but they are all reserved.  There
is a basic problem with the way the test allocates hugetlb pages which
has existed since the test was originally written.

Due to the way 'cleanup' was done between different phases of the test,
this issue was masked until recently.  The issue was uncovered by commit
8ba6e8640844 ("userfaultfd/selftests: reinitialize test context in each
test").

For the hugetlb test, src and dst areas are allocated as PRIVATE
mappings of a hugetlb file.  This means that at mmap time, pages are
reserved for the src and dst areas.  At the start of event testing (and
other tests) the src area is populated which results in allocation of
huge pages to fill the area and consumption of reserves associated with
the area.  Then, a child is forked to fault in the dst area.  Note that
the dst area was allocated in the parent and hence the parent owns the
reserves associated with the mapping.  The child has normal access to
the dst area, but can not use the reserves created/owned by the parent.
Thus, if there are no other huge pages available allocation of a page
for the dst by the child will fail.

Fix by not creating reserves for the dst area.  In this way the child
can use free (non-reserved) pages.

Also, MAP_PRIVATE of a file only makes sense if you are interested in
the contents of the file before making a COW copy.  The test does not do
this.  So, just use MAP_ANONYMOUS | MAP_HUGETLB to create an anonymous
hugetlb mapping.  There is no need to create a hugetlb file in the
non-shared case.

Link: https://lkml.kernel.org/r/20211217172919.7861-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 8a09057d2f22..9354a5e0321c 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -87,7 +87,7 @@ static bool test_uffdio_minor = false;
 
 static bool map_shared;
 static int shm_fd;
-static int huge_fd;
+static int huge_fd = -1;	/* only used for hugetlb_shared test */
 static char *huge_fd_off0;
 static unsigned long long *count_verify;
 static int uffd = -1;
@@ -223,6 +223,9 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 
 static void hugetlb_release_pages(char *rel_area)
 {
+	if (huge_fd == -1)
+		return;
+
 	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 		      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
 		      nr_pages * page_size))
@@ -235,16 +238,17 @@ static void hugetlb_allocate_area(void **alloc_area)
 	char **alloc_area_alias;
 
 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
-			   MAP_HUGETLB,
-			   huge_fd, *alloc_area == area_src ? 0 :
-			   nr_pages * page_size);
+			   map_shared ? MAP_SHARED :
+			   MAP_PRIVATE | MAP_HUGETLB |
+			   (*alloc_area == area_src ? 0 : MAP_NORESERVE),
+			   huge_fd,
+			   *alloc_area == area_src ? 0 : nr_pages * page_size);
 	if (*alloc_area == MAP_FAILED)
 		err("mmap of hugetlbfs file failed");
 
 	if (map_shared) {
 		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-				  MAP_SHARED | MAP_HUGETLB,
+				  MAP_SHARED,
 				  huge_fd, *alloc_area == area_src ? 0 :
 				  nr_pages * page_size);
 		if (area_alias == MAP_FAILED)
-- 
cgit v1.2.3


From cef5cd25a453805237f0c4e789218d987d674290 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Wed, 24 Nov 2021 12:38:00 +0530
Subject: selftest/kexec: fix "ignored null byte in input" warning

Instead of assigning the string to a variable, which might contain a
null character, redirect the output and grep for the string directly.

Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 tools/testing/selftests/kexec/test_kexec_file_load.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh
index 2ff600388c30..99f6fc23ee31 100755
--- a/tools/testing/selftests/kexec/test_kexec_file_load.sh
+++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh
@@ -97,10 +97,11 @@ check_for_imasig()
 check_for_modsig()
 {
 	local module_sig_string="~Module signature appended~"
-	local sig="$(tail --bytes $((${#module_sig_string} + 1)) $KERNEL_IMAGE)"
 	local ret=0
 
-	if [ "$sig" == "$module_sig_string" ]; then
+	tail --bytes $((${#module_sig_string} + 1)) $KERNEL_IMAGE | \
+		grep -q "$module_sig_string"
+	if [ $? -eq 0 ]; then
 		ret=1
 		log_info "kexec kernel image modsig signed"
 	else
-- 
cgit v1.2.3


From 9be6dc8059bb235ffb0666690a26c0df15cc6d9e Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Fri, 3 Dec 2021 10:20:42 -0500
Subject: selftests/kexec: update searching for the Kconfig

First check /lib/modules/`uname -r`/config, before using the IKCONFIG.
In addition, the configs.ko might be compressed.  Fix the configs.ko
name.

Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 tools/testing/selftests/kexec/kexec_common_lib.sh | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kexec/kexec_common_lib.sh b/tools/testing/selftests/kexec/kexec_common_lib.sh
index 43017cfe88f7..5a1b8ae04c64 100755
--- a/tools/testing/selftests/kexec/kexec_common_lib.sh
+++ b/tools/testing/selftests/kexec/kexec_common_lib.sh
@@ -138,15 +138,20 @@ kconfig_enabled()
 	return 0
 }
 
-# Attempt to get the kernel config first via proc, and then by
-# extracting it from the kernel image or the configs.ko using
-# scripts/extract-ikconfig.
+# Attempt to get the kernel config first by checking the modules directory
+# then via proc, and finally by extracting it from the kernel image or the
+# configs.ko using scripts/extract-ikconfig.
 # Return 1 for found.
 get_kconfig()
 {
 	local proc_config="/proc/config.gz"
 	local module_dir="/lib/modules/`uname -r`"
-	local configs_module="$module_dir/kernel/kernel/configs.ko"
+	local configs_module="$module_dir/kernel/kernel/configs.ko*"
+
+	if [ -f $module_dir/config ]; then
+		IKCONFIG=$module_dir/config
+		return 1
+	fi
 
 	if [ ! -f $proc_config ]; then
 		modprobe configs > /dev/null 2>&1
-- 
cgit v1.2.3


From 5e75d0b215b868337e7a193f28a543ec00e858b1 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 31 Dec 2021 10:01:08 +0800
Subject: selftests: net: udpgro_fwd.sh: explicitly checking the available ping
 feature

As Paolo pointed out, the result of ping IPv6 address depends on
the running distro. So explicitly checking the available ping feature,
as e.g. do the bareudp.sh self-tests.

Fixes: 8b3170e07539 ("selftests: net: using ping6 for IPv6 in udpgro_fwd.sh")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Link: https://lore.kernel.org/r/825ee22b-4245-dbf7-d2f7-a230770d6e21@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/udpgro_fwd.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index 3ea73013d956..6f05e06f6761 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -193,7 +193,8 @@ for family in 4 6; do
 		SUFFIX="64 nodad"
 		VXDEV=vxlan6
 		IPT=ip6tables
-		PING="ping6"
+		# Use ping6 on systems where ping doesn't handle IPv6
+		ping -w 1 -c 1 ::1 > /dev/null 2>&1 || PING="ping6"
 	fi
 
 	echo "IPv$family"
-- 
cgit v1.2.3


From e0257a01d6689c273a019756ed5e13911cc1bfed Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 22 Dec 2021 00:11:30 +0800
Subject: perf pmu: Fix alias events list

Commit 0e0ae8742207c3b4 ("perf list: Display hybrid PMU events with cpu
type") changes the event list for uncore PMUs or arm64 heterogeneous CPU
systems, such that duplicate aliases are incorrectly listed per PMU
(which they should not be), like:

  # perf list
  ...
  unc_cbo_cache_lookup.any_es
  [Unit: uncore_cbox L3 Lookup any request that access cache and found
  line in E or S-state]
  unc_cbo_cache_lookup.any_es
  [Unit: uncore_cbox L3 Lookup any request that access cache and found
  line in E or S-state]
  unc_cbo_cache_lookup.any_i
  [Unit: uncore_cbox L3 Lookup any request that access cache and found
  line in I-state]
  unc_cbo_cache_lookup.any_i
  [Unit: uncore_cbox L3 Lookup any request that access cache and found
  line in I-state]
  ...

Notice how the events are listed twice.

The named commit changed how we remove duplicate events, in that events
for different PMUs are not treated as duplicates. I suppose this is to
handle how "Each hybrid pmu event has been assigned with a pmu name".

Fix PMU alias listing by restoring behaviour to remove duplicates for
non-hybrid PMUs.

Fixes: 0e0ae8742207c3b4 ("perf list: Display hybrid PMU events with cpu type")
Signed-off-by: John Garry <john.garry@huawei.com>
Tested-by: Zhengjun Xing <zhengjun.xing@linux.intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/1640103090-140490-1-git-send-email-john.garry@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/pmu.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 6ae58406f4fc..8dfbba15aeb8 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1659,6 +1659,21 @@ bool is_pmu_core(const char *name)
 	return !strcmp(name, "cpu") || is_arm_pmu_core(name);
 }
 
+static bool pmu_alias_is_duplicate(struct sevent *alias_a,
+				   struct sevent *alias_b)
+{
+	/* Different names -> never duplicates */
+	if (strcmp(alias_a->name, alias_b->name))
+		return false;
+
+	/* Don't remove duplicates for hybrid PMUs */
+	if (perf_pmu__is_hybrid(alias_a->pmu) &&
+	    perf_pmu__is_hybrid(alias_b->pmu))
+		return false;
+
+	return true;
+}
+
 void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
 			bool long_desc, bool details_flag, bool deprecated,
 			const char *pmu_name)
@@ -1744,12 +1759,8 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
 	qsort(aliases, len, sizeof(struct sevent), cmp_sevent);
 	for (j = 0; j < len; j++) {
 		/* Skip duplicates */
-		if (j > 0 && !strcmp(aliases[j].name, aliases[j - 1].name)) {
-			if (!aliases[j].pmu || !aliases[j - 1].pmu ||
-			    !strcmp(aliases[j].pmu, aliases[j - 1].pmu)) {
-				continue;
-			}
-		}
+		if (j > 0 && pmu_alias_is_duplicate(&aliases[j], &aliases[j - 1]))
+			continue;
 
 		if (name_only) {
 			printf("%s ", aliases[j].name);
-- 
cgit v1.2.3


From 64f18d2d043015b3f835ce4c9f3beb97cfd19b6e Mon Sep 17 00:00:00 2001
From: yaowenbin <yaowenbin1@huawei.com>
Date: Wed, 29 Dec 2021 16:55:19 +0800
Subject: perf top: Fix TUI exit screen refresh race condition

When the following command is executed several times, a coredump file is
generated.

	$ timeout -k 9 5 perf top -e task-clock
	*******
	*******
	*******
	0.01%  [kernel]                  [k] __do_softirq
	0.01%  libpthread-2.28.so        [.] __pthread_mutex_lock
	0.01%  [kernel]                  [k] __ll_sc_atomic64_sub_return
	double free or corruption (!prev) perf top --sort comm,dso
	timeout: the monitored command dumped core

When we terminate "perf top" using sending signal method,
SLsmg_reset_smg() called. SLsmg_reset_smg() resets the SLsmg screen
management routines by freeing all memory allocated while it was active.

However SLsmg_reinit_smg() maybe be called by another thread.

SLsmg_reinit_smg() will free the same memory accessed by
SLsmg_reset_smg(), thus it results in a double free.

SLsmg_reinit_smg() is called already protected by ui__lock, so we fix
the problem by adding pthread_mutex_trylock of ui__lock when calling
SLsmg_reset_smg().

Signed-off-by: Wenyu Liu <liuwenyu7@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: wuxu.wu@huawei.com
Link: http://lore.kernel.org/lkml/a91e3943-7ddc-f5c0-a7f5-360f073c20e6@huawei.com
Signed-off-by: Hewenliang <hewenliang4@huawei.com>
Signed-off-by: yaowenbin <yaowenbin1@huawei.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/tui/setup.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c
index e9bfe856a5de..b1be59b4e2a4 100644
--- a/tools/perf/ui/tui/setup.c
+++ b/tools/perf/ui/tui/setup.c
@@ -170,9 +170,11 @@ void ui__exit(bool wait_for_ok)
 				    "Press any key...", 0);
 
 	SLtt_set_cursor_visibility(1);
-	SLsmg_refresh();
-	SLsmg_reset_smg();
+	if (!pthread_mutex_trylock(&ui__lock)) {
+		SLsmg_refresh();
+		SLsmg_reset_smg();
+		pthread_mutex_unlock(&ui__lock);
+	}
 	SLang_reset_tty();
-
 	perf_error__unregister(&perf_tui_eops);
 }
-- 
cgit v1.2.3


From f15dcf1b5853e770ed614e23ab2e90f0ca1b7b0b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 28 Dec 2021 12:14:14 +0000
Subject: KVM: arm64: selftests: get-reg-list: Add pauth configuration

The get-reg-list test ignores the Pointer Authentication features,
which is a shame now that we have relatively common HW with this feature.

Define two new configurations (with and without PMU) that exercise the
KVM capabilities.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20211228121414.1013250-1-maz@kernel.org
---
 tools/testing/selftests/kvm/aarch64/get-reg-list.c | 50 ++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
index cc898181faab..f769fc6cd927 100644
--- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
@@ -1014,6 +1014,22 @@ static __u64 sve_rejects_set[] = {
 	KVM_REG_ARM64_SVE_VLS,
 };
 
+static __u64 pauth_addr_regs[] = {
+	ARM64_SYS_REG(3, 0, 2, 1, 0),	/* APIAKEYLO_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 1, 1),	/* APIAKEYHI_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 1, 2),	/* APIBKEYLO_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 1, 3),	/* APIBKEYHI_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 2, 0),	/* APDAKEYLO_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 2, 1),	/* APDAKEYHI_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 2, 2),	/* APDBKEYLO_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 2, 3)	/* APDBKEYHI_EL1 */
+};
+
+static __u64 pauth_generic_regs[] = {
+	ARM64_SYS_REG(3, 0, 2, 3, 0),	/* APGAKEYLO_EL1 */
+	ARM64_SYS_REG(3, 0, 2, 3, 1),	/* APGAKEYHI_EL1 */
+};
+
 #define BASE_SUBLIST \
 	{ "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), }
 #define VREGS_SUBLIST \
@@ -1025,6 +1041,21 @@ static __u64 sve_rejects_set[] = {
 	{ "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \
 	  .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \
 	  .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), }
+#define PAUTH_SUBLIST							\
+	{								\
+		.name 		= "pauth_address",			\
+		.capability	= KVM_CAP_ARM_PTRAUTH_ADDRESS,		\
+		.feature	= KVM_ARM_VCPU_PTRAUTH_ADDRESS,		\
+		.regs		= pauth_addr_regs,			\
+		.regs_n		= ARRAY_SIZE(pauth_addr_regs),		\
+	},								\
+	{								\
+		.name 		= "pauth_generic",			\
+		.capability	= KVM_CAP_ARM_PTRAUTH_GENERIC,		\
+		.feature	= KVM_ARM_VCPU_PTRAUTH_GENERIC,		\
+		.regs		= pauth_generic_regs,			\
+		.regs_n		= ARRAY_SIZE(pauth_generic_regs),	\
+	}
 
 static struct vcpu_config vregs_config = {
 	.sublists = {
@@ -1056,11 +1087,30 @@ static struct vcpu_config sve_pmu_config = {
 	{0},
 	},
 };
+static struct vcpu_config pauth_config = {
+	.sublists = {
+	BASE_SUBLIST,
+	VREGS_SUBLIST,
+	PAUTH_SUBLIST,
+	{0},
+	},
+};
+static struct vcpu_config pauth_pmu_config = {
+	.sublists = {
+	BASE_SUBLIST,
+	VREGS_SUBLIST,
+	PAUTH_SUBLIST,
+	PMU_SUBLIST,
+	{0},
+	},
+};
 
 static struct vcpu_config *vcpu_configs[] = {
 	&vregs_config,
 	&vregs_pmu_config,
 	&sve_config,
 	&sve_pmu_config,
+	&pauth_config,
+	&pauth_pmu_config,
 };
 static int vcpu_configs_n = ARRAY_SIZE(vcpu_configs);
-- 
cgit v1.2.3


From b22bf1b9979a608827dea98c61ed9ec297bcc513 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Tue, 4 Jan 2022 18:59:29 +0100
Subject: bpftool: Refactor misc. feature probe

There is currently a single miscellaneous feature probe,
HAVE_LARGE_INSN_LIMIT, to check for the 1M instructions limit in the
verifier. Subsequent patches will add additional miscellaneous probes,
which follow the same pattern at the existing probe. This patch
therefore refactors the probe to avoid code duplication in subsequent
patches.

The BPF program type and the checked error numbers in the
HAVE_LARGE_INSN_LIMIT probe are changed to better generalize to other
probes. The feature probe retains its current behavior despite those
changes.

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/956c9329a932c75941194f91790d01f31dfbe01b.1641314075.git.paul@isovalent.com
---
 tools/bpf/bpftool/feature.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 6719b9282eca..3da97a02f455 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -642,6 +642,30 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
 		printf("\n");
 }
 
+static void
+probe_misc_feature(struct bpf_insn *insns, size_t len,
+		   const char *define_prefix, __u32 ifindex,
+		   const char *feat_name, const char *plain_name,
+		   const char *define_name)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.prog_ifindex = ifindex,
+	);
+	bool res;
+	int fd;
+
+	errno = 0;
+	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+			   insns, len, &opts);
+	res = fd >= 0 || !errno;
+
+	if (fd >= 0)
+		close(fd);
+
+	print_bool_feature(feat_name, plain_name, define_name, res,
+			   define_prefix);
+}
+
 /*
  * Probe for availability of kernel commit (5.3):
  *
@@ -649,29 +673,18 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type,
  */
 static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex)
 {
-	LIBBPF_OPTS(bpf_prog_load_opts, opts,
-		.prog_ifindex = ifindex,
-	);
 	struct bpf_insn insns[BPF_MAXINSNS + 1];
-	bool res;
-	int i, fd;
+	int i;
 
 	for (i = 0; i < BPF_MAXINSNS; i++)
 		insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1);
 	insns[BPF_MAXINSNS] = BPF_EXIT_INSN();
 
-	errno = 0;
-	fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, NULL, "GPL",
-			   insns, ARRAY_SIZE(insns), &opts);
-	res = fd >= 0 || (errno != E2BIG && errno != EINVAL);
-
-	if (fd >= 0)
-		close(fd);
-
-	print_bool_feature("have_large_insn_limit",
+	probe_misc_feature(insns, ARRAY_SIZE(insns),
+			   define_prefix, ifindex,
+			   "have_large_insn_limit",
 			   "Large program size limit",
-			   "LARGE_INSN_LIMIT",
-			   res, define_prefix);
+			   "LARGE_INSN_LIMIT");
 }
 
 static void
-- 
cgit v1.2.3


From c04fb2b0bd9275969be3b0a95f9c3ef76b1bfb73 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Tue, 4 Jan 2022 18:59:57 +0100
Subject: bpftool: Probe for bounded loop support

This patch introduces a new probe to check whether the verifier supports
bounded loops as introduced in commit 2589726d12a1 ("bpf: introduce
bounded loops"). This patch will allow BPF users such as Cilium to probe
for loop support on startup and only unconditionally unroll loops on
older kernels.

The results are displayed as part of the miscellaneous section, as shown
below.

  $ bpftool feature probe | grep loops
  Bounded loop support is available
  $ bpftool feature probe macro | grep LOOPS
  #define HAVE_BOUNDED_LOOPS
  $ bpftool feature probe -j | jq .misc
  {
    "have_large_insn_limit": true,
    "have_bounded_loops": true
  }

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/f7807c0b27d79f48e71de7b5a99c680ca4bd0151.1641314075.git.paul@isovalent.com
---
 tools/bpf/bpftool/feature.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 3da97a02f455..03579d113042 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -687,6 +687,27 @@ static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex)
 			   "LARGE_INSN_LIMIT");
 }
 
+/*
+ * Probe for bounded loop support introduced in commit 2589726d12a1
+ * ("bpf: introduce bounded loops").
+ */
+static void
+probe_bounded_loops(const char *define_prefix, __u32 ifindex)
+{
+	struct bpf_insn insns[4] = {
+		BPF_MOV64_IMM(BPF_REG_0, 10),
+		BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, -2),
+		BPF_EXIT_INSN()
+	};
+
+	probe_misc_feature(insns, ARRAY_SIZE(insns),
+			   define_prefix, ifindex,
+			   "have_bounded_loops",
+			   "Bounded loop support",
+			   "BOUNDED_LOOPS");
+}
+
 static void
 section_system_config(enum probe_component target, const char *define_prefix)
 {
@@ -801,6 +822,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex)
 			    "/*** eBPF misc features ***/",
 			    define_prefix);
 	probe_large_insn_limit(define_prefix, ifindex);
+	probe_bounded_loops(define_prefix, ifindex);
 	print_end_section();
 }
 
-- 
cgit v1.2.3


From 0fd800b2456cf90ed738a1260b53acaa8843b5ae Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@isovalent.com>
Date: Tue, 4 Jan 2022 19:00:13 +0100
Subject: bpftool: Probe for instruction set extensions

This patch introduces new probes to check whether the kernel supports
instruction set extensions v2 and v3. The first introduced eBPF
instructions BPF_J{LT,LE,SLT,SLE} in commit 92b31a9af73b ("bpf: add
BPF_J{LT,LE,SLT,SLE} instructions"). The second introduces 32-bit
variants of all jump instructions in commit 092ed0968bb6 ("bpf:
verifier support JMP32").

These probes are useful for userspace BPF projects that want to use newer
instruction set extensions on newer kernels, to reduce the programs'
sizes or their complexity. LLVM already provides an mcpu=probe option to
automatically probe the kernel and select the newest-supported
instruction set extension. That is however not flexible enough for all
use cases. For example, in Cilium, we only want to use the v3
instruction set extension on v5.10+, even though it is supported on all
kernels v5.1+.

Signed-off-by: Paul Chaignon <paul@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/3bfedcd9898c1f41ac67ca61f144fec84c6c3a92.1641314075.git.paul@isovalent.com
---
 tools/bpf/bpftool/feature.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 03579d113042..e999159fa28d 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -708,6 +708,48 @@ probe_bounded_loops(const char *define_prefix, __u32 ifindex)
 			   "BOUNDED_LOOPS");
 }
 
+/*
+ * Probe for the v2 instruction set extension introduced in commit 92b31a9af73b
+ * ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions").
+ */
+static void
+probe_v2_isa_extension(const char *define_prefix, __u32 ifindex)
+{
+	struct bpf_insn insns[4] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN()
+	};
+
+	probe_misc_feature(insns, ARRAY_SIZE(insns),
+			   define_prefix, ifindex,
+			   "have_v2_isa_extension",
+			   "ISA extension v2",
+			   "V2_ISA_EXTENSION");
+}
+
+/*
+ * Probe for the v3 instruction set extension introduced in commit 092ed0968bb6
+ * ("bpf: verifier support JMP32").
+ */
+static void
+probe_v3_isa_extension(const char *define_prefix, __u32 ifindex)
+{
+	struct bpf_insn insns[4] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_JMP32_IMM(BPF_JLT, BPF_REG_0, 0, 1),
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN()
+	};
+
+	probe_misc_feature(insns, ARRAY_SIZE(insns),
+			   define_prefix, ifindex,
+			   "have_v3_isa_extension",
+			   "ISA extension v3",
+			   "V3_ISA_EXTENSION");
+}
+
 static void
 section_system_config(enum probe_component target, const char *define_prefix)
 {
@@ -823,6 +865,8 @@ static void section_misc(const char *define_prefix, __u32 ifindex)
 			    define_prefix);
 	probe_large_insn_limit(define_prefix, ifindex);
 	probe_bounded_loops(define_prefix, ifindex);
+	probe_v2_isa_extension(define_prefix, ifindex);
+	probe_v3_isa_extension(define_prefix, ifindex);
 	print_end_section();
 }
 
-- 
cgit v1.2.3


From 5e22dd18626726028a93ff1350a8a71a00fd843d Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 4 Jan 2022 13:10:30 +0100
Subject: bpf/selftests: Fix namespace mount setup in tc_redirect

The tc_redirect umounts /sys in the new namespace, which can be
mounted as shared and cause global umount. The lazy umount also
takes down mounted trees under /sys like debugfs, which won't be
available after sysfs mounts again and could cause fails in other
tests.

  # cat /proc/self/mountinfo | grep debugfs
  34 23 0:7 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime shared:14 - debugfs debugfs rw
  # cat /proc/self/mountinfo | grep sysfs
  23 86 0:22 / /sys rw,nosuid,nodev,noexec,relatime shared:2 - sysfs sysfs rw
  # mount | grep debugfs
  debugfs on /sys/kernel/debug type debugfs (rw,nosuid,nodev,noexec,relatime)

  # ./test_progs -t tc_redirect
  #164 tc_redirect:OK
  Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED

  # mount | grep debugfs
  # cat /proc/self/mountinfo | grep debugfs
  # cat /proc/self/mountinfo | grep sysfs
  25 86 0:22 / /sys rw,relatime shared:2 - sysfs sysfs rw

Making the sysfs private under the new namespace so the umount won't
trigger the global sysfs umount.

Reported-by: Hangbin Liu <haliu@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jussi Maki <joamaki@gmail.com>
Link: https://lore.kernel.org/bpf/20220104121030.138216-1-jolsa@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index 4b18b73df10b..c2426df58e17 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -105,6 +105,13 @@ static int setns_by_fd(int nsfd)
 	if (!ASSERT_OK(err, "unshare"))
 		return err;
 
+	/* Make our /sys mount private, so the following umount won't
+	 * trigger the global umount in case it's shared.
+	 */
+	err = mount("none", "/sys", NULL, MS_PRIVATE, NULL);
+	if (!ASSERT_OK(err, "remount private /sys"))
+		return err;
+
 	err = umount2("/sys", MNT_DETACH);
 	if (!ASSERT_OK(err, "umount2 /sys"))
 		return err;
-- 
cgit v1.2.3


From 65e38e32a959dbbb0bf5cf1ae699789f81759be6 Mon Sep 17 00:00:00 2001
From: Nageswara R Sastry <rnsastry@linux.ibm.com>
Date: Wed, 24 Nov 2021 12:38:01 +0530
Subject: selftests/kexec: Enable secureboot tests for PowerPC

Existing test cases determine secureboot state using efi variable, which
is available only on x86 architecture.  Add support for determining
secureboot state using device tree property on PowerNV architecture.

Signed-off-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Reviewed-by: Nayna Jain <nayna@linux.ibm.com>
Tested-by: Nayna Jain <nayna@linux.ibm.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 tools/testing/selftests/kexec/Makefile             |  2 +-
 tools/testing/selftests/kexec/kexec_common_lib.sh  | 38 ++++++++++++++++++----
 .../selftests/kexec/test_kexec_file_load.sh        |  8 +++--
 3 files changed, 39 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kexec/Makefile b/tools/testing/selftests/kexec/Makefile
index aa91d2063249..806a150648c3 100644
--- a/tools/testing/selftests/kexec/Makefile
+++ b/tools/testing/selftests/kexec/Makefile
@@ -4,7 +4,7 @@
 uname_M := $(shell uname -m 2>/dev/null || echo not)
 ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
 
-ifeq ($(ARCH),x86)
+ifeq ($(ARCH),$(filter $(ARCH),x86 ppc64le))
 TEST_PROGS := test_kexec_load.sh test_kexec_file_load.sh
 TEST_FILES := kexec_common_lib.sh
 
diff --git a/tools/testing/selftests/kexec/kexec_common_lib.sh b/tools/testing/selftests/kexec/kexec_common_lib.sh
index 5a1b8ae04c64..0e114b34d5d7 100755
--- a/tools/testing/selftests/kexec/kexec_common_lib.sh
+++ b/tools/testing/selftests/kexec/kexec_common_lib.sh
@@ -91,6 +91,27 @@ get_efi_var_secureboot_mode()
 	return 0;
 }
 
+# On powerpc platform, check device-tree property
+# /proc/device-tree/ibm,secureboot/os-secureboot-enforcing
+# to detect secureboot state.
+get_ppc64_secureboot_mode()
+{
+	local secure_boot_file="/proc/device-tree/ibm,secureboot/os-secureboot-enforcing"
+	# Check for secure boot file existence
+	if [ -f $secure_boot_file ]; then
+		log_info "Secureboot is enabled (Device tree)"
+		return 1;
+	fi
+	log_info "Secureboot is not enabled (Device tree)"
+	return 0;
+}
+
+# Return the architecture of the system
+get_arch()
+{
+	echo $(arch)
+}
+
 # Check efivar SecureBoot-$(the UUID) and SetupMode-$(the UUID).
 # The secure boot mode can be accessed either as the last integer
 # of "od -An -t u1 /sys/firmware/efi/efivars/SecureBoot-*" or from
@@ -100,14 +121,19 @@ get_efi_var_secureboot_mode()
 get_secureboot_mode()
 {
 	local secureboot_mode=0
+	local system_arch=$(get_arch)
 
-	get_efivarfs_secureboot_mode
-	secureboot_mode=$?
-
-	# fallback to using the efi_var files
-	if [ $secureboot_mode -eq 0 ]; then
-		get_efi_var_secureboot_mode
+	if [ "$system_arch" == "ppc64le" ]; then
+		get_ppc64_secureboot_mode
 		secureboot_mode=$?
+	else
+		get_efivarfs_secureboot_mode
+		secureboot_mode=$?
+		# fallback to using the efi_var files
+		if [ $secureboot_mode -eq 0 ]; then
+			get_efi_var_secureboot_mode
+			secureboot_mode=$?
+		fi
 	fi
 
 	if [ $secureboot_mode -eq 0 ]; then
diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh
index 99f6fc23ee31..c9ccb3c93d72 100755
--- a/tools/testing/selftests/kexec/test_kexec_file_load.sh
+++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh
@@ -226,8 +226,12 @@ get_secureboot_mode
 secureboot=$?
 
 # Are there pe and ima signatures
-check_for_pesig
-pe_signed=$?
+if [ "$(get_arch)" == 'ppc64le' ]; then
+	pe_signed=0
+else
+	check_for_pesig
+	pe_signed=$?
+fi
 
 check_for_imasig
 ima_signed=$?
-- 
cgit v1.2.3


From db54c12a3d7e3eedd37aa08efc9362e905f07716 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Wed, 5 Jan 2022 14:44:36 +0000
Subject: selftests: set amt.sh executable

amt.sh test script will not work because it doesn't have execution
permission. So, it adds execution permission.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: c08e8baea78e ("selftests: add amt interface selftest script")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Link: https://lore.kernel.org/r/20220105144436.13415-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/amt.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tools/testing/selftests/net/amt.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh
old mode 100644
new mode 100755
-- 
cgit v1.2.3


From ca796fe66f7fceff17679ee6cc5fe4b4023de44d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 5 Jan 2022 11:33:34 -0800
Subject: bpf, selftests: Add verifier test for mem_or_null register with
 offset.

Add a new test case with mem_or_null typed register with off > 0 to ensure
it gets rejected by the verifier:

  # ./test_verifier 1011
  #1009/u check with invalid reg offset 0 OK
  #1009/p check with invalid reg offset 0 OK
  Summary: 2 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/spill_fill.c | 28 +++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c
index 6c907144311f..1a8eb9672bd1 100644
--- a/tools/testing/selftests/bpf/verifier/spill_fill.c
+++ b/tools/testing/selftests/bpf/verifier/spill_fill.c
@@ -58,6 +58,34 @@
 	.result = ACCEPT,
 	.result_unpriv = ACCEPT,
 },
+{
+	"check with invalid reg offset 0",
+	.insns = {
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	/* store a pointer to the reserved memory in R6 */
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	/* add invalid offset to memory or NULL */
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+	/* should not be able to access *(R7) = 0 */
+	BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0),
+	/* submit the reserved ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 1 },
+	.result = REJECT,
+	.errstr = "R0 pointer arithmetic on mem_or_null prohibited",
+},
 {
 	"check corrupted spill/fill",
 	.insns = {
-- 
cgit v1.2.3


From 7218c28c87f57c131879a75a226b9033ac90b266 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Wed, 29 Dec 2021 12:41:56 -0800
Subject: libbpf: Deprecate bpf_perf_event_read_simple() API

With perf_buffer__poll() and perf_buffer__consume() APIs available,
there is no reason to expose bpf_perf_event_read_simple() API to
users. If users need custom perf buffer, they could re-implement
the function.

Mark bpf_perf_event_read_simple() and move the logic to a new
static function so it can still be called by other functions in the
same file.

  [0] Closes: https://github.com/libbpf/libbpf/issues/310

Signed-off-by: Christy Lee <christylee@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211229204156.13569-1-christylee@fb.com
---
 tools/lib/bpf/libbpf.c | 22 ++++++++++++++--------
 tools/lib/bpf/libbpf.h |  1 +
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9cb99d1e2385..1d02ba7f11b4 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10676,10 +10676,10 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
 	return link;
 }
 
-enum bpf_perf_event_ret
-bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
-			   void **copy_mem, size_t *copy_size,
-			   bpf_perf_event_print_t fn, void *private_data)
+static enum bpf_perf_event_ret
+perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+		       void **copy_mem, size_t *copy_size,
+		       bpf_perf_event_print_t fn, void *private_data)
 {
 	struct perf_event_mmap_page *header = mmap_mem;
 	__u64 data_head = ring_buffer_read_head(header);
@@ -10724,6 +10724,12 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
 	return libbpf_err(ret);
 }
 
+__attribute__((alias("perf_event_read_simple")))
+enum bpf_perf_event_ret
+bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+			   void **copy_mem, size_t *copy_size,
+			   bpf_perf_event_print_t fn, void *private_data);
+
 struct perf_buffer;
 
 struct perf_buffer_params {
@@ -11132,10 +11138,10 @@ static int perf_buffer__process_records(struct perf_buffer *pb,
 {
 	enum bpf_perf_event_ret ret;
 
-	ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size,
-					 pb->page_size, &cpu_buf->buf,
-					 &cpu_buf->buf_size,
-					 perf_buffer__process_record, cpu_buf);
+	ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size,
+				     pb->page_size, &cpu_buf->buf,
+				     &cpu_buf->buf_size,
+				     perf_buffer__process_record, cpu_buf);
 	if (ret != LIBBPF_PERF_EVENT_CONT)
 		return ret;
 	return 0;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 85dfef88b3d2..ddf1cc9e7803 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1026,6 +1026,7 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i
 typedef enum bpf_perf_event_ret
 	(*bpf_perf_event_print_t)(struct perf_event_header *hdr,
 				  void *private_data);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or  perf_buffer__consume() instead")
 LIBBPF_API enum bpf_perf_event_ret
 bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
 			   void **copy_mem, size_t *copy_size,
-- 
cgit v1.2.3


From 71cff670baff5cc6a6eeb0181e2cc55579c5e1e0 Mon Sep 17 00:00:00 2001
From: Qiang Wang <wangqiang.wq.frank@bytedance.com>
Date: Mon, 27 Dec 2021 21:07:12 +0800
Subject: libbpf: Use probe_name for legacy kprobe

Fix a bug in commit 46ed5fc33db9, which wrongly used the
func_name instead of probe_name to register legacy kprobe.

Fixes: 46ed5fc33db9 ("libbpf: Refactor and simplify legacy kprobe code")
Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Qiang Wang <wangqiang.wq.frank@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Hengqi Chen <hengqi.chen@gmail.com>
Reviewed-by: Hengqi Chen <hengqi.chen@gmail.com>
Link: https://lore.kernel.org/bpf/20211227130713.66933-1-wangqiang.wq.frank@bytedance.com
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1d02ba7f11b4..26e49e6aa5b1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10017,7 +10017,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog,
 		gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name),
 					     func_name, offset);
 
-		legacy_probe = strdup(func_name);
+		legacy_probe = strdup(probe_name);
 		if (!legacy_probe)
 			return libbpf_err_ptr(-ENOMEM);
 
-- 
cgit v1.2.3


From 51a33c60f1c22c0d2dafad774315ba1537765442 Mon Sep 17 00:00:00 2001
From: Qiang Wang <wangqiang.wq.frank@bytedance.com>
Date: Mon, 27 Dec 2021 21:07:13 +0800
Subject: libbpf: Support repeated legacy kprobes on same function

If repeated legacy kprobes on same function in one process,
libbpf will register using the same probe name and got -EBUSY
error. So append index to the probe name format to fix this
problem.

Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Qiang Wang <wangqiang.wq.frank@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211227130713.66933-2-wangqiang.wq.frank@bytedance.com
---
 tools/lib/bpf/libbpf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 26e49e6aa5b1..7f10dd501a52 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9916,7 +9916,10 @@ static int append_to_file(const char *file, const char *fmt, ...)
 static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz,
 					 const char *kfunc_name, size_t offset)
 {
-	snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), kfunc_name, offset);
+	static int index = 0;
+
+	snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset,
+		 __sync_fetch_and_add(&index, 1));
 }
 
 static int add_kprobe_event_legacy(const char *probe_name, bool retprobe,
-- 
cgit v1.2.3


From 9855c131b9c8b0327ff5182f88bb1991f212415b Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Tue, 4 Jan 2022 16:06:01 -0800
Subject: libbpf 1.0: Deprecate bpf_map__is_offload_neutral()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deprecate bpf_map__is_offload_neutral(). It’s most probably broken
already. PERF_EVENT_ARRAY isn’t the only map that’s not suitable
for hardware offloading. Applications can directly check map type
instead.

  [0] Closes: https://github.com/libbpf/libbpf/issues/306

Signed-off-by: Christy Lee <christylee@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220105000601.2090044-1-christylee@fb.com
---
 tools/bpf/bpftool/prog.c | 2 +-
 tools/lib/bpf/libbpf.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f874896c4154..2a21d50516bc 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1655,7 +1655,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only)
 	j = 0;
 	idx = 0;
 	bpf_object__for_each_map(map, obj) {
-		if (!bpf_map__is_offload_neutral(map))
+		if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			bpf_map__set_ifindex(map, ifindex);
 
 		if (j < old_map_fds && idx == map_replace[j].idx) {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index ddf1cc9e7803..88dd943ba545 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -744,6 +744,7 @@ LIBBPF_API void *bpf_map__priv(const struct bpf_map *map);
 LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map,
 					  const void *data, size_t size);
 LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize);
+LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead")
 LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map);
 
 /**
-- 
cgit v1.2.3


From 5f6082642814050352a3e29f8713796b55ebf788 Mon Sep 17 00:00:00 2001
From: Christy Lee <christylee@fb.com>
Date: Tue, 4 Jan 2022 16:31:20 -0800
Subject: libbpf 1.0: Deprecate bpf_object__find_map_by_offset() API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

API created with simplistic assumptions about BPF map definitions.
It hasn’t worked for a while, deprecate it in preparation for
libbpf 1.0.

  [0] Closes: https://github.com/libbpf/libbpf/issues/302

Signed-off-by: Christy Lee <christylee@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220105003120.2222673-1-christylee@fb.com
---
 tools/lib/bpf/libbpf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 88dd943ba545..8b9bc5e90c2b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -677,7 +677,8 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name);
  * Get bpf_map through the offset of corresponding struct bpf_map_def
  * in the BPF object file.
  */
-LIBBPF_API struct bpf_map *
+LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead")
+struct bpf_map *
 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset);
 
 LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead")
-- 
cgit v1.2.3


From 87eee9c5589e1e546537a38a72e7ea3f0a7cac60 Mon Sep 17 00:00:00 2001
From: Miroslav Lichvar <mlichvar@redhat.com>
Date: Wed, 5 Jan 2022 16:25:06 +0100
Subject: testptp: set pin function before other requests

When the -L option of the testptp utility is specified with other
options (e.g. -p to enable PPS output), the user probably wants to
apply it to the pin configured by the -L option.

Reorder the code to set the pin function before other function requests
to avoid confusing users.

Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Link: https://lore.kernel.org/r/20220105152506.3256026-1-mlichvar@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/ptp/testptp.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c
index f7911aaeb007..c0f6a062364d 100644
--- a/tools/testing/selftests/ptp/testptp.c
+++ b/tools/testing/selftests/ptp/testptp.c
@@ -354,6 +354,18 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (pin_index >= 0) {
+		memset(&desc, 0, sizeof(desc));
+		desc.index = pin_index;
+		desc.func = pin_func;
+		desc.chan = index;
+		if (ioctl(fd, PTP_PIN_SETFUNC, &desc)) {
+			perror("PTP_PIN_SETFUNC");
+		} else {
+			puts("set pin function okay");
+		}
+	}
+
 	if (extts) {
 		memset(&extts_request, 0, sizeof(extts_request));
 		extts_request.index = index;
@@ -444,18 +456,6 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	if (pin_index >= 0) {
-		memset(&desc, 0, sizeof(desc));
-		desc.index = pin_index;
-		desc.func = pin_func;
-		desc.chan = index;
-		if (ioctl(fd, PTP_PIN_SETFUNC, &desc)) {
-			perror("PTP_PIN_SETFUNC");
-		} else {
-			puts("set pin function okay");
-		}
-	}
-
 	if (pps != -1) {
 		int enable = pps ? 1 : 0;
 		if (ioctl(fd, PTP_ENABLE_PPS, enable)) {
-- 
cgit v1.2.3


From 1585f590a2e5dcae5833b05e030c47229299dd09 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 27 Dec 2021 11:52:53 +0800
Subject: selftests: netfilter: switch to socat for tests using -q option

The nc cmd(nmap-ncat) that distributed with Fedora/Red Hat does not have
option -q. This make some tests failed with:

	nc: invalid option -- 'q'

Let's switch to socat which is far more dependable.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh |  9 +++++----
 tools/testing/selftests/netfilter/nf_nat_edemux.sh      | 10 +++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
index 4a6f5c3b3215..eb9553e4986b 100755
--- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
+++ b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
@@ -41,7 +41,7 @@ checktool (){
 
 checktool "iptables --version" "run test without iptables"
 checktool "ip -Version" "run test without ip tool"
-checktool "which nc" "run test without nc (netcat)"
+checktool "which socat" "run test without socat"
 checktool "ip netns add ${r_a}" "create net namespace"
 
 for n in ${r_b} ${r_w} ${c_a} ${c_b};do
@@ -60,11 +60,12 @@ trap cleanup EXIT
 test_path() {
 	msg="$1"
 
-	ip netns exec ${c_b} nc -n -w 3 -q 3 -u -l -p 5000 > ${rx} < /dev/null &
+	ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null &
 
 	sleep 1
 	for i in 1 2 3; do
-		head -c1400 /dev/zero | tr "\000" "a" | ip netns exec ${c_a} nc -n -w 1 -u 192.168.20.2 5000
+		head -c1400 /dev/zero | tr "\000" "a" | \
+			ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000
 	done
 
 	wait
@@ -189,7 +190,7 @@ ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 #---------------------
 #Now we send a 1400 bytes UDP packet from Client A to Client B:
 
-# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | nc -u 192.168.20.2 5000
+# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | socat -u STDIN UDP:192.168.20.2:5000
 test_path "without"
 
 # The IPv4 stack on Client A already knows the PMTU to Client B, so the
diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh
index cfee3b65be0f..1092bbcb1fba 100755
--- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh
+++ b/tools/testing/selftests/netfilter/nf_nat_edemux.sh
@@ -76,23 +76,23 @@ ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1
 sleep 1
 
 # add a persistent connection from the other namespace
-ip netns exec $ns2 nc -q 10 -w 10 192.168.1.1 5201 > /dev/null &
+ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null &
 
 sleep 1
 
 # ip daddr:dport will be rewritten to 192.168.1.1 5201
 # NAT must reallocate source port 10000 because
 # 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use
-echo test | ip netns exec $ns2 nc -w 3 -q 3 10.96.0.1 443 >/dev/null
+echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443 >/dev/null
 ret=$?
 
 kill $iperfs
 
-# Check nc can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
+# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
 if [ $ret -eq 0 ]; then
-	echo "PASS: nc can connect via NAT'd address"
+	echo "PASS: socat can connect via NAT'd address"
 else
-	echo "FAIL: nc cannot connect via NAT'd address"
+	echo "FAIL: socat cannot connect via NAT'd address"
 	exit 1
 fi
 
-- 
cgit v1.2.3


From 788490e798a707cea75d80448f5c6c5f322ada6e Mon Sep 17 00:00:00 2001
From: Anup Patel <anup.patel@wdc.com>
Date: Fri, 26 Nov 2021 18:33:45 +0530
Subject: KVM: selftests: Add EXTRA_CFLAGS in top-level Makefile

We add EXTRA_CFLAGS to the common CFLAGS of top-level Makefile which will
allow users to pass additional compile-time flags such as "-static".

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Reviewed-and-tested-by: Atish Patra <atishp@rivosinc.com>
Reviewed-and-tested-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 17342b575e85..869edc2a5a34 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -133,7 +133,7 @@ endif
 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
 	-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
 	-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
-	-I$(<D) -Iinclude/$(UNAME_M) -I..
+	-I$(<D) -Iinclude/$(UNAME_M) -I.. $(EXTRA_CFLAGS)
 
 no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
         $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie)
-- 
cgit v1.2.3


From 3e06cdf10520e629e711b76b21070d6e67ae7d06 Mon Sep 17 00:00:00 2001
From: Anup Patel <anup.patel@wdc.com>
Date: Tue, 5 Oct 2021 18:09:56 +0530
Subject: KVM: selftests: Add initial support for RISC-V 64-bit

We add initial support for RISC-V 64-bit in KVM selftests using
which we can cross-compile and run arch independent tests such as:
demand_paging_test
dirty_log_test
kvm_create_max_vcpus,
kvm_page_table_test
set_memory_region_test
kvm_binary_stats_test

All VM guest modes defined in kvm_util.h require at least 48-bit
guest virtual address so to use KVM RISC-V selftests hardware
need to support at least Sv48 MMU for guest (i.e. VS-mode).

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Reviewed-and-tested-by: Atish Patra <atishp@rivosinc.com>
---
 tools/testing/selftests/kvm/Makefile               |  12 +
 tools/testing/selftests/kvm/include/kvm_util.h     |  10 +
 .../selftests/kvm/include/riscv/processor.h        | 135 ++++++++
 tools/testing/selftests/kvm/lib/guest_modes.c      |  10 +
 tools/testing/selftests/kvm/lib/riscv/processor.c  | 362 +++++++++++++++++++++
 tools/testing/selftests/kvm/lib/riscv/ucall.c      |  87 +++++
 6 files changed, 616 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/include/riscv/processor.h
 create mode 100644 tools/testing/selftests/kvm/lib/riscv/processor.c
 create mode 100644 tools/testing/selftests/kvm/lib/riscv/ucall.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 869edc2a5a34..a2d65f218795 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -32,11 +32,16 @@ endif
 ifeq ($(ARCH),s390)
 	UNAME_M := s390x
 endif
+# Set UNAME_M riscv compile/install to work
+ifeq ($(ARCH),riscv)
+	UNAME_M := riscv
+endif
 
 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
 LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
 LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S lib/aarch64/spinlock.c lib/aarch64/gic.c lib/aarch64/gic_v3.c lib/aarch64/vgic.c
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
+LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
 
 TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
@@ -119,6 +124,13 @@ TEST_GEN_PROGS_s390x += rseq_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 TEST_GEN_PROGS_s390x += kvm_binary_stats_test
 
+TEST_GEN_PROGS_riscv += demand_paging_test
+TEST_GEN_PROGS_riscv += dirty_log_test
+TEST_GEN_PROGS_riscv += kvm_create_max_vcpus
+TEST_GEN_PROGS_riscv += kvm_page_table_test
+TEST_GEN_PROGS_riscv += set_memory_region_test
+TEST_GEN_PROGS_riscv += kvm_binary_stats_test
+
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
 
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d62edc49d67..da854f997aeb 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -69,6 +69,16 @@ enum vm_guest_mode {
 #define MIN_PAGE_SHIFT			12U
 #define ptes_per_page(page_size)	((page_size) / 16)
 
+#elif defined(__riscv)
+
+#if __riscv_xlen == 32
+#error "RISC-V 32-bit kvm selftests not supported"
+#endif
+
+#define VM_MODE_DEFAULT			VM_MODE_P40V48_4K
+#define MIN_PAGE_SHIFT			12U
+#define ptes_per_page(page_size)	((page_size) / 8)
+
 #endif
 
 #define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h
new file mode 100644
index 000000000000..dc284c6bdbc3
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/riscv/processor.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RISC-V processor specific defines
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include "kvm_util.h"
+#include <linux/stringify.h>
+
+static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t idx,
+				    uint64_t  size)
+{
+	return KVM_REG_RISCV | type | idx | size;
+}
+
+#if __riscv_xlen == 64
+#define KVM_REG_SIZE_ULONG	KVM_REG_SIZE_U64
+#else
+#define KVM_REG_SIZE_ULONG	KVM_REG_SIZE_U32
+#endif
+
+#define RISCV_CONFIG_REG(name)	__kvm_reg_id(KVM_REG_RISCV_CONFIG, \
+					     KVM_REG_RISCV_CONFIG_REG(name), \
+					     KVM_REG_SIZE_ULONG)
+
+#define RISCV_CORE_REG(name)	__kvm_reg_id(KVM_REG_RISCV_CORE, \
+					     KVM_REG_RISCV_CORE_REG(name), \
+					     KVM_REG_SIZE_ULONG)
+
+#define RISCV_CSR_REG(name)	__kvm_reg_id(KVM_REG_RISCV_CSR, \
+					     KVM_REG_RISCV_CSR_REG(name), \
+					     KVM_REG_SIZE_ULONG)
+
+#define RISCV_TIMER_REG(name)	__kvm_reg_id(KVM_REG_RISCV_TIMER, \
+					     KVM_REG_RISCV_TIMER_REG(name), \
+					     KVM_REG_SIZE_U64)
+
+static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id,
+			   unsigned long *addr)
+{
+	struct kvm_one_reg reg;
+
+	reg.id = id;
+	reg.addr = (unsigned long)addr;
+	vcpu_get_reg(vm, vcpuid, &reg);
+}
+
+static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id,
+			   unsigned long val)
+{
+	struct kvm_one_reg reg;
+
+	reg.id = id;
+	reg.addr = (unsigned long)&val;
+	vcpu_set_reg(vm, vcpuid, &reg);
+}
+
+/* L3 index Bit[47:39] */
+#define PGTBL_L3_INDEX_MASK			0x0000FF8000000000ULL
+#define PGTBL_L3_INDEX_SHIFT			39
+#define PGTBL_L3_BLOCK_SHIFT			39
+#define PGTBL_L3_BLOCK_SIZE			0x0000008000000000ULL
+#define PGTBL_L3_MAP_MASK			(~(PGTBL_L3_BLOCK_SIZE - 1))
+/* L2 index Bit[38:30] */
+#define PGTBL_L2_INDEX_MASK			0x0000007FC0000000ULL
+#define PGTBL_L2_INDEX_SHIFT			30
+#define PGTBL_L2_BLOCK_SHIFT			30
+#define PGTBL_L2_BLOCK_SIZE			0x0000000040000000ULL
+#define PGTBL_L2_MAP_MASK			(~(PGTBL_L2_BLOCK_SIZE - 1))
+/* L1 index Bit[29:21] */
+#define PGTBL_L1_INDEX_MASK			0x000000003FE00000ULL
+#define PGTBL_L1_INDEX_SHIFT			21
+#define PGTBL_L1_BLOCK_SHIFT			21
+#define PGTBL_L1_BLOCK_SIZE			0x0000000000200000ULL
+#define PGTBL_L1_MAP_MASK			(~(PGTBL_L1_BLOCK_SIZE - 1))
+/* L0 index Bit[20:12] */
+#define PGTBL_L0_INDEX_MASK			0x00000000001FF000ULL
+#define PGTBL_L0_INDEX_SHIFT			12
+#define PGTBL_L0_BLOCK_SHIFT			12
+#define PGTBL_L0_BLOCK_SIZE			0x0000000000001000ULL
+#define PGTBL_L0_MAP_MASK			(~(PGTBL_L0_BLOCK_SIZE - 1))
+
+#define PGTBL_PTE_ADDR_MASK			0x003FFFFFFFFFFC00ULL
+#define PGTBL_PTE_ADDR_SHIFT			10
+#define PGTBL_PTE_RSW_MASK			0x0000000000000300ULL
+#define PGTBL_PTE_RSW_SHIFT			8
+#define PGTBL_PTE_DIRTY_MASK			0x0000000000000080ULL
+#define PGTBL_PTE_DIRTY_SHIFT			7
+#define PGTBL_PTE_ACCESSED_MASK			0x0000000000000040ULL
+#define PGTBL_PTE_ACCESSED_SHIFT		6
+#define PGTBL_PTE_GLOBAL_MASK			0x0000000000000020ULL
+#define PGTBL_PTE_GLOBAL_SHIFT			5
+#define PGTBL_PTE_USER_MASK			0x0000000000000010ULL
+#define PGTBL_PTE_USER_SHIFT			4
+#define PGTBL_PTE_EXECUTE_MASK			0x0000000000000008ULL
+#define PGTBL_PTE_EXECUTE_SHIFT			3
+#define PGTBL_PTE_WRITE_MASK			0x0000000000000004ULL
+#define PGTBL_PTE_WRITE_SHIFT			2
+#define PGTBL_PTE_READ_MASK			0x0000000000000002ULL
+#define PGTBL_PTE_READ_SHIFT			1
+#define PGTBL_PTE_PERM_MASK			(PGTBL_PTE_EXECUTE_MASK | \
+						 PGTBL_PTE_WRITE_MASK | \
+						 PGTBL_PTE_READ_MASK)
+#define PGTBL_PTE_VALID_MASK			0x0000000000000001ULL
+#define PGTBL_PTE_VALID_SHIFT			0
+
+#define PGTBL_PAGE_SIZE				PGTBL_L0_BLOCK_SIZE
+#define PGTBL_PAGE_SIZE_SHIFT			PGTBL_L0_BLOCK_SHIFT
+
+#define SATP_PPN				_AC(0x00000FFFFFFFFFFF, UL)
+#define SATP_MODE_39				_AC(0x8000000000000000, UL)
+#define SATP_MODE_48				_AC(0x9000000000000000, UL)
+#define SATP_ASID_BITS				16
+#define SATP_ASID_SHIFT				44
+#define SATP_ASID_MASK				_AC(0xFFFF, UL)
+
+#define SBI_EXT_EXPERIMENTAL_START	0x08000000
+#define SBI_EXT_EXPERIMENTAL_END	0x08FFFFFF
+
+#define KVM_RISCV_SELFTESTS_SBI_EXT	SBI_EXT_EXPERIMENTAL_END
+
+struct sbiret {
+	long error;
+	long value;
+};
+
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+			unsigned long arg1, unsigned long arg2,
+			unsigned long arg3, unsigned long arg4,
+			unsigned long arg5);
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index c330f414ef96..57839a12e472 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -38,6 +38,16 @@ void guest_modes_append_default(void)
 			guest_mode_append(VM_MODE_P47V64_4K, true, true);
 	}
 #endif
+#ifdef __riscv
+	{
+		unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS);
+
+		if (sz >= 52)
+			guest_mode_append(VM_MODE_P52V48_4K, true, true);
+		if (sz >= 48)
+			guest_mode_append(VM_MODE_P48V48_4K, true, true);
+	}
+#endif
 }
 
 void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg)
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
new file mode 100644
index 000000000000..d377f2603d98
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RISC-V code
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/compiler.h>
+#include <assert.h>
+
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+#define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN	0xac0000
+
+static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
+{
+	return (v + vm->page_size) & ~(vm->page_size - 1);
+}
+
+static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
+{
+	return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) <<
+		PGTBL_PAGE_SIZE_SHIFT;
+}
+
+static uint64_t ptrs_per_pte(struct kvm_vm *vm)
+{
+	return PGTBL_PAGE_SIZE / sizeof(uint64_t);
+}
+
+static uint64_t pte_index_mask[] = {
+	PGTBL_L0_INDEX_MASK,
+	PGTBL_L1_INDEX_MASK,
+	PGTBL_L2_INDEX_MASK,
+	PGTBL_L3_INDEX_MASK,
+};
+
+static uint32_t pte_index_shift[] = {
+	PGTBL_L0_INDEX_SHIFT,
+	PGTBL_L1_INDEX_SHIFT,
+	PGTBL_L2_INDEX_SHIFT,
+	PGTBL_L3_INDEX_SHIFT,
+};
+
+static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level)
+{
+	TEST_ASSERT(level > -1,
+		"Negative page table level (%d) not possible", level);
+	TEST_ASSERT(level < vm->pgtable_levels,
+		"Invalid page table level (%d)", level);
+
+	return (gva & pte_index_mask[level]) >> pte_index_shift[level];
+}
+
+void virt_pgd_alloc(struct kvm_vm *vm)
+{
+	if (!vm->pgd_created) {
+		vm_paddr_t paddr = vm_phy_pages_alloc(vm,
+			page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size,
+			KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);
+		vm->pgd = paddr;
+		vm->pgd_created = true;
+	}
+}
+
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
+{
+	uint64_t *ptep, next_ppn;
+	int level = vm->pgtable_levels - 1;
+
+	TEST_ASSERT((vaddr % vm->page_size) == 0,
+		"Virtual address not on page boundary,\n"
+		"  vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size);
+	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+		(vaddr >> vm->page_shift)),
+		"Invalid virtual address, vaddr: 0x%lx", vaddr);
+	TEST_ASSERT((paddr % vm->page_size) == 0,
+		"Physical address not on page boundary,\n"
+		"  paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size);
+	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+		"Physical address beyond maximum supported,\n"
+		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		paddr, vm->max_gfn, vm->page_size);
+
+	ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8;
+	if (!*ptep) {
+		next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT;
+		*ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) |
+			PGTBL_PTE_VALID_MASK;
+	}
+	level--;
+
+	while (level > -1) {
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) +
+		       pte_index(vm, vaddr, level) * 8;
+		if (!*ptep && level > 0) {
+			next_ppn = vm_alloc_page_table(vm) >>
+				   PGTBL_PAGE_SIZE_SHIFT;
+			*ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) |
+				PGTBL_PTE_VALID_MASK;
+		}
+		level--;
+	}
+
+	paddr = paddr >> PGTBL_PAGE_SIZE_SHIFT;
+	*ptep = (paddr << PGTBL_PTE_ADDR_SHIFT) |
+		PGTBL_PTE_PERM_MASK | PGTBL_PTE_VALID_MASK;
+}
+
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	uint64_t *ptep;
+	int level = vm->pgtable_levels - 1;
+
+	if (!vm->pgd_created)
+		goto unmapped_gva;
+
+	ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8;
+	if (!ptep)
+		goto unmapped_gva;
+	level--;
+
+	while (level > -1) {
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) +
+		       pte_index(vm, gva, level) * 8;
+		if (!ptep)
+			goto unmapped_gva;
+		level--;
+	}
+
+	return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
+
+unmapped_gva:
+	TEST_FAIL("No mapping for vm virtual address gva: 0x%lx level: %d",
+		  gva, level);
+	exit(1);
+}
+
+static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent,
+		     uint64_t page, int level)
+{
+#ifdef DEBUG
+	static const char *const type[] = { "pte", "pmd", "pud", "p4d"};
+	uint64_t pte, *ptep;
+
+	if (level < 0)
+		return;
+
+	for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) {
+		ptep = addr_gpa2hva(vm, pte);
+		if (!*ptep)
+			continue;
+		fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "",
+			type[level], pte, *ptep, ptep);
+		pte_dump(stream, vm, indent + 1,
+			 pte_addr(vm, *ptep), level - 1);
+	}
+#endif
+}
+
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+	int level = vm->pgtable_levels - 1;
+	uint64_t pgd, *ptep;
+
+	if (!vm->pgd_created)
+		return;
+
+	for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) {
+		ptep = addr_gpa2hva(vm, pgd);
+		if (!*ptep)
+			continue;
+		fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "",
+			pgd, *ptep, ptep);
+		pte_dump(stream, vm, indent + 1,
+			 pte_addr(vm, *ptep), level - 1);
+	}
+}
+
+void riscv_vcpu_mmu_setup(struct kvm_vm *vm, int vcpuid)
+{
+	unsigned long satp;
+
+	/*
+	 * The RISC-V Sv48 MMU mode supports 56-bit physical address
+	 * for 48-bit virtual address with 4KB last level page size.
+	 */
+	switch (vm->mode) {
+	case VM_MODE_P52V48_4K:
+	case VM_MODE_P48V48_4K:
+	case VM_MODE_P40V48_4K:
+		break;
+	default:
+		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+	}
+
+	satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN;
+	satp |= SATP_MODE_48;
+
+	set_reg(vm, vcpuid, RISCV_CSR_REG(satp), satp);
+}
+
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+	struct kvm_riscv_core core;
+
+	get_reg(vm, vcpuid, RISCV_CORE_REG(mode), &core.mode);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc), &core.regs.pc);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.ra), &core.regs.ra);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp), &core.regs.sp);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), &core.regs.gp);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.tp), &core.regs.tp);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t0), &core.regs.t0);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t1), &core.regs.t1);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t2), &core.regs.t2);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s0), &core.regs.s0);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s1), &core.regs.s1);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a0), &core.regs.a0);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a1), &core.regs.a1);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a2), &core.regs.a2);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a3), &core.regs.a3);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a4), &core.regs.a4);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a5), &core.regs.a5);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a6), &core.regs.a6);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.a7), &core.regs.a7);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s2), &core.regs.s2);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s3), &core.regs.s3);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s4), &core.regs.s4);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s5), &core.regs.s5);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s6), &core.regs.s6);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s7), &core.regs.s7);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s8), &core.regs.s8);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s9), &core.regs.s9);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s10), &core.regs.s10);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.s11), &core.regs.s11);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t3), &core.regs.t3);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t4), &core.regs.t4);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t5), &core.regs.t5);
+	get_reg(vm, vcpuid, RISCV_CORE_REG(regs.t6), &core.regs.t6);
+
+	fprintf(stream,
+		" MODE:  0x%lx\n", core.mode);
+	fprintf(stream,
+		" PC: 0x%016lx   RA: 0x%016lx SP: 0x%016lx GP: 0x%016lx\n",
+		core.regs.pc, core.regs.ra, core.regs.sp, core.regs.gp);
+	fprintf(stream,
+		" TP: 0x%016lx   T0: 0x%016lx T1: 0x%016lx T2: 0x%016lx\n",
+		core.regs.tp, core.regs.t0, core.regs.t1, core.regs.t2);
+	fprintf(stream,
+		" S0: 0x%016lx   S1: 0x%016lx A0: 0x%016lx A1: 0x%016lx\n",
+		core.regs.s0, core.regs.s1, core.regs.a0, core.regs.a1);
+	fprintf(stream,
+		" A2: 0x%016lx   A3: 0x%016lx A4: 0x%016lx A5: 0x%016lx\n",
+		core.regs.a2, core.regs.a3, core.regs.a4, core.regs.a5);
+	fprintf(stream,
+		" A6: 0x%016lx   A7: 0x%016lx S2: 0x%016lx S3: 0x%016lx\n",
+		core.regs.a6, core.regs.a7, core.regs.s2, core.regs.s3);
+	fprintf(stream,
+		" S4: 0x%016lx   S5: 0x%016lx S6: 0x%016lx S7: 0x%016lx\n",
+		core.regs.s4, core.regs.s5, core.regs.s6, core.regs.s7);
+	fprintf(stream,
+		" S8: 0x%016lx   S9: 0x%016lx S10: 0x%016lx S11: 0x%016lx\n",
+		core.regs.s8, core.regs.s9, core.regs.s10, core.regs.s11);
+	fprintf(stream,
+		" T3: 0x%016lx   T4: 0x%016lx T5: 0x%016lx T6: 0x%016lx\n",
+		core.regs.t3, core.regs.t4, core.regs.t5, core.regs.t6);
+}
+
+static void guest_hang(void)
+{
+	while (1)
+		;
+}
+
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+	int r;
+	size_t stack_size = vm->page_size == 4096 ?
+					DEFAULT_STACK_PGS * vm->page_size :
+					vm->page_size;
+	unsigned long stack_vaddr = vm_vaddr_alloc(vm, stack_size,
+					DEFAULT_RISCV_GUEST_STACK_VADDR_MIN);
+	unsigned long current_gp = 0;
+	struct kvm_mp_state mps;
+
+	vm_vcpu_add(vm, vcpuid);
+	riscv_vcpu_mmu_setup(vm, vcpuid);
+
+	/*
+	 * With SBI HSM support in KVM RISC-V, all secondary VCPUs are
+	 * powered-off by default so we ensure that all secondary VCPUs
+	 * are powered-on using KVM_SET_MP_STATE ioctl().
+	 */
+	mps.mp_state = KVM_MP_STATE_RUNNABLE;
+	r = _vcpu_ioctl(vm, vcpuid, KVM_SET_MP_STATE, &mps);
+	TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r);
+
+	/* Setup global pointer of guest to be same as the host */
+	asm volatile (
+		"add %0, gp, zero" : "=r" (current_gp) : : "memory");
+	set_reg(vm, vcpuid, RISCV_CORE_REG(regs.gp), current_gp);
+
+	/* Setup stack pointer and program counter of guest */
+	set_reg(vm, vcpuid, RISCV_CORE_REG(regs.sp),
+		stack_vaddr + stack_size);
+	set_reg(vm, vcpuid, RISCV_CORE_REG(regs.pc),
+		(unsigned long)guest_code);
+
+	/* Setup default exception vector of guest */
+	set_reg(vm, vcpuid, RISCV_CSR_REG(stvec),
+		(unsigned long)guest_hang);
+}
+
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+	va_list ap;
+	uint64_t id = RISCV_CORE_REG(regs.a0);
+	int i;
+
+	TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n"
+		    "  num: %u\n", num);
+
+	va_start(ap, num);
+
+	for (i = 0; i < num; i++) {
+		switch (i) {
+		case 0:
+			id = RISCV_CORE_REG(regs.a0);
+			break;
+		case 1:
+			id = RISCV_CORE_REG(regs.a1);
+			break;
+		case 2:
+			id = RISCV_CORE_REG(regs.a2);
+			break;
+		case 3:
+			id = RISCV_CORE_REG(regs.a3);
+			break;
+		case 4:
+			id = RISCV_CORE_REG(regs.a4);
+			break;
+		case 5:
+			id = RISCV_CORE_REG(regs.a5);
+			break;
+		case 6:
+			id = RISCV_CORE_REG(regs.a6);
+			break;
+		case 7:
+			id = RISCV_CORE_REG(regs.a7);
+			break;
+		};
+		set_reg(vm, vcpuid, id, va_arg(ap, uint64_t));
+	}
+
+	va_end(ap);
+}
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+}
diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c
new file mode 100644
index 000000000000..9e42d8248fa6
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/kvm.h>
+
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+}
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+}
+
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+			unsigned long arg1, unsigned long arg2,
+			unsigned long arg3, unsigned long arg4,
+			unsigned long arg5)
+{
+	register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
+	register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
+	register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
+	register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
+	register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
+	register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
+	register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
+	register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
+	struct sbiret ret;
+
+	asm volatile (
+		"ecall"
+		: "+r" (a0), "+r" (a1)
+		: "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
+		: "memory");
+	ret.error = a0;
+	ret.value = a1;
+
+	return ret;
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+	struct ucall uc = {
+		.cmd = cmd,
+	};
+	va_list va;
+	int i;
+
+	nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+	va_start(va, nargs);
+	for (i = 0; i < nargs; ++i)
+		uc.args[i] = va_arg(va, uint64_t);
+	va_end(va);
+
+	sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, 0, (vm_vaddr_t)&uc,
+		  0, 0, 0, 0, 0);
+}
+
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+	struct kvm_run *run = vcpu_state(vm, vcpu_id);
+	struct ucall ucall = {};
+
+	if (uc)
+		memset(uc, 0, sizeof(*uc));
+
+	if (run->exit_reason == KVM_EXIT_RISCV_SBI &&
+	    run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT &&
+	    run->riscv_sbi.function_id == 0) {
+		memcpy(&ucall, addr_gva2hva(vm, run->riscv_sbi.args[0]),
+			sizeof(ucall));
+
+		vcpu_run_complete_io(vm, vcpu_id);
+		if (uc)
+			memcpy(uc, &ucall, sizeof(ucall));
+	}
+
+	return ucall.cmd;
+}
-- 
cgit v1.2.3


From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Wed, 5 Jan 2022 02:48:38 -0800
Subject: gro: add ability to control gro max packet size

Eric Dumazet suggested to allow users to modify max GRO packet size.

We have seen GRO being disabled by users of appliances (such as
wifi access points) because of claimed bufferbloat issues,
or some work arounds in sch_cake, to split GRO/GSO packets.

Instead of disabling GRO completely, one can chose to limit
the maximum packet size of GRO packets, depending on their
latency constraints.

This patch adds a per device gro_max_size attribute
that can be changed with ip link command.

ip link set dev eth0 gro_max_size 16000

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h          | 11 +++++++++++
 include/uapi/linux/if_link.h       |  1 +
 net/core/dev.c                     |  1 +
 net/core/gro.c                     |  6 +++++-
 net/core/rtnetlink.c               | 22 ++++++++++++++++++++++
 tools/include/uapi/linux/if_link.h |  1 +
 6 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6f99c8f51b60..3213c7227b59 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type {
  *			dev->addr_list_lock.
  *	@unlink_list:	As netif_addr_lock() can be called recursively,
  *			keep a list of interfaces to be deleted.
+ *	@gro_max_size:	Maximum size of aggregated packet in generic
+ *			receive offload (GRO)
  *
  *	@dev_addr_shadow:	Copy of @dev_addr to catch direct writes.
  *	@linkwatch_dev_tracker:	refcount tracker used by linkwatch.
@@ -2131,6 +2133,8 @@ struct net_device {
 	struct bpf_prog __rcu	*xdp_prog;
 	unsigned long		gro_flush_timeout;
 	int			napi_defer_hard_irqs;
+#define GRO_MAX_SIZE		65536
+	unsigned int		gro_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
@@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev,
 	WRITE_ONCE(dev->gso_max_segs, segs);
 }
 
+static inline void netif_set_gro_max_size(struct net_device *dev,
+					  unsigned int size)
+{
+	/* This pairs with the READ_ONCE() in skb_gro_receive() */
+	WRITE_ONCE(dev->gro_max_size, size);
+}
+
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
 					int mac_len)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4ac53b30b6dc..6218f93f5c1a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -347,6 +347,7 @@ enum {
 	 */
 	IFLA_PARENT_DEV_NAME,
 	IFLA_PARENT_DEV_BUS_NAME,
+	IFLA_GRO_MAX_SIZE,
 
 	__IFLA_MAX
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c8b226b5f2f..83a4089990a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10180,6 +10180,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
+	dev->gro_max_size = GRO_MAX_SIZE;
 	dev->upper_level = 1;
 	dev->lower_level = 1;
 #ifdef CONFIG_LOCKDEP
diff --git a/net/core/gro.c b/net/core/gro.c
index 8ec8b44596da..a11b286d1495 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -132,10 +132,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	unsigned int headlen = skb_headlen(skb);
 	unsigned int len = skb_gro_len(skb);
 	unsigned int delta_truesize;
+	unsigned int gro_max_size;
 	unsigned int new_truesize;
 	struct sk_buff *lp;
 
-	if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
+	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
+	gro_max_size = READ_ONCE(p->dev->gro_max_size);
+
+	if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
 		return -E2BIG;
 
 	lp = NAPI_GRO_CB(p)->last;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d6eba554b137..e476403231f0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1026,6 +1026,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
 	       + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
 	       + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+	       + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1728,6 +1729,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
 	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
 	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
+	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
 #endif
@@ -1880,6 +1882,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
 	[IFLA_NEW_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
 	[IFLA_PARENT_DEV_NAME]	= { .type = NLA_NUL_STRING },
+	[IFLA_GRO_MAX_SIZE]	= { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2299,6 +2302,14 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
 		}
 	}
 
+	if (tb[IFLA_GRO_MAX_SIZE]) {
+		u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);
+
+		if (gro_max_size > GRO_MAX_SIZE) {
+			NL_SET_ERR_MSG(extack, "too big gro_max_size");
+			return -EINVAL;
+		}
+	}
 	return 0;
 }
 
@@ -2772,6 +2783,15 @@ static int do_setlink(const struct sk_buff *skb,
 		}
 	}
 
+	if (tb[IFLA_GRO_MAX_SIZE]) {
+		u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);
+
+		if (dev->gro_max_size ^ gro_max_size) {
+			netif_set_gro_max_size(dev, gro_max_size);
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
 	if (tb[IFLA_OPERSTATE])
 		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 
@@ -3222,6 +3242,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
 		netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
 	if (tb[IFLA_GSO_MAX_SEGS])
 		netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
+	if (tb[IFLA_GRO_MAX_SIZE])
+		netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
 
 	return dev;
 }
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 4ac53b30b6dc..6218f93f5c1a 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -347,6 +347,7 @@ enum {
 	 */
 	IFLA_PARENT_DEV_NAME,
 	IFLA_PARENT_DEV_BUS_NAME,
+	IFLA_GRO_MAX_SIZE,
 
 	__IFLA_MAX
 };
-- 
cgit v1.2.3


From b09c2baa56347ae65795350dfcc633dedb1c2970 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Jan 2022 11:02:29 -1000
Subject: selftests: cgroup: Make cg_create() use 0755 for permission instead
 of 0644
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

0644 is an odd perm to create a cgroup which is a directory. Use the regular
0755 instead. This is necessary for euid switching test case.

Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/cgroup_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 623cec04ad42..0cf7e90c0052 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -221,7 +221,7 @@ int cg_find_unified_root(char *root, size_t len)
 
 int cg_create(const char *cgroup)
 {
-	return mkdir(cgroup, 0644);
+	return mkdir(cgroup, 0755);
 }
 
 int cg_wait_for_proc_count(const char *cgroup, int count)
-- 
cgit v1.2.3


From 613e040e4dc285367bff0f8f75ea59839bc10947 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Jan 2022 11:02:29 -1000
Subject: selftests: cgroup: Test open-time credential usage for migration
 checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a task is writing to an fd opened by a different task, the perm check
should use the credentials of the latter task. Add a test for it.

Tested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_core.c | 68 ++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 3df648c37876..01b766506973 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -674,6 +674,73 @@ cleanup:
 	return ret;
 }
 
+/*
+ * cgroup migration permission check should be performed based on the
+ * credentials at the time of open instead of write.
+ */
+static int test_cgcore_lesser_euid_open(const char *root)
+{
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	uid_t saved_uid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	saved_uid = geteuid();
+	if (seteuid(test_euid))
+		goto cleanup;
+
+	cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
+
+	if (seteuid(saved_uid))
+		goto cleanup;
+
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
@@ -689,6 +756,7 @@ struct corecg_test {
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
 	T(test_cgcore_destroy),
+	T(test_cgcore_lesser_euid_open),
 };
 #undef T
 
-- 
cgit v1.2.3


From bf35a7879f1dfb0d050fe779168bcf25c7de66f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Jan 2022 11:02:29 -1000
Subject: selftests: cgroup: Test open-time cgroup namespace usage for
 migration checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a task is writing to an fd opened by a different task, the perm check
should use the cgroup namespace of the latter task. Add a test for it.

Tested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_core.c | 97 ++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 01b766506973..600123503063 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -1,11 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#define _GNU_SOURCE
 #include <linux/limits.h>
+#include <linux/sched.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sched.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -741,6 +744,99 @@ cleanup:
 	return ret;
 }
 
+struct lesser_ns_open_thread_arg {
+	const char	*path;
+	int		fd;
+	int		err;
+};
+
+static int lesser_ns_open_thread_fn(void *arg)
+{
+	struct lesser_ns_open_thread_arg *targ = arg;
+
+	targ->fd = open(targ->path, O_RDWR);
+	targ->err = errno;
+	return 0;
+}
+
+/*
+ * cgroup migration permission check should be performed based on the cgroup
+ * namespace at the time of open instead of write.
+ */
+static int test_cgcore_lesser_ns_open(const char *root)
+{
+	static char stack[65536];
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	struct lesser_ns_open_thread_arg targ = { .fd = -1 };
+	pid_t pid;
+	int status;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_b))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	targ.path = cg_test_b_procs;
+	pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
+		    CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
+		    &targ);
+	if (pid < 0)
+		goto cleanup;
+
+	if (waitpid(pid, &status, 0) < 0)
+		goto cleanup;
+
+	if (!WIFEXITED(status))
+		goto cleanup;
+
+	cg_test_b_procs_fd = targ.fd;
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
@@ -757,6 +853,7 @@ struct corecg_test {
 	T(test_cgcore_thread_migration),
 	T(test_cgcore_destroy),
 	T(test_cgcore_lesser_euid_open),
+	T(test_cgcore_lesser_ns_open),
 };
 #undef T
 
-- 
cgit v1.2.3


From 70bc793382a0e37ba4e35e4d1a317b280b829a44 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Jan 2022 12:51:56 -0800
Subject: selftests/bpf: Don't rely on preserving volatile in PT_REGS macros in
 loop3

PT_REGS*() macro on some architectures force-cast struct pt_regs to
other types (user_pt_regs, etc) and might drop volatile modifiers, if any.
Volatile isn't really required as pt_regs value isn't supposed to change
during the BPF program run, so this is correct behavior.

But progs/loop3.c relies on that volatile modifier to ensure that loop
is preserved. Fix loop3.c by declaring i and sum variables as volatile
instead. It preserves the loop and makes the test pass on all
architectures (including s390x which is currently broken).

Fixes: 3cc31d794097 ("libbpf: Normalize PT_REGS_xxx() macro definitions")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220106205156.955373-1-andrii@kernel.org
---
 tools/testing/selftests/bpf/progs/loop3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c
index 76e93b31c14b..717dab14322b 100644
--- a/tools/testing/selftests/bpf/progs/loop3.c
+++ b/tools/testing/selftests/bpf/progs/loop3.c
@@ -12,9 +12,9 @@
 char _license[] SEC("license") = "GPL";
 
 SEC("raw_tracepoint/consume_skb")
-int while_true(volatile struct pt_regs* ctx)
+int while_true(struct pt_regs *ctx)
 {
-	__u64 i = 0, sum = 0;
+	volatile __u64 i = 0, sum = 0;
 	do {
 		i++;
 		sum += PT_REGS_RC(ctx);
-- 
cgit v1.2.3


From e59618f0f46fa6cf86d5b82380e0f453756b282b Mon Sep 17 00:00:00 2001
From: Grant Seltzer <grantseltzer@gmail.com>
Date: Thu, 6 Jan 2022 15:13:05 -0500
Subject: libbpf: Add documentation for bpf_map batch operations

This adds documention for:

- bpf_map_delete_batch()
- bpf_map_lookup_batch()
- bpf_map_lookup_and_delete_batch()
- bpf_map_update_batch()

This also updates the public API for the `keys` parameter
of `bpf_map_delete_batch()`, and both the
`keys` and `values` parameters of `bpf_map_update_batch()`
to be constants.

Signed-off-by: Grant Seltzer <grantseltzer@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220106201304.112675-1-grantseltzer@gmail.com
---
 tools/lib/bpf/bpf.c |   8 ++--
 tools/lib/bpf/bpf.h | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 117 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9b64eed2b003..550b4cbb6c99 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -691,11 +691,11 @@ static int bpf_map_batch_common(int cmd, int fd, void  *in_batch,
 	return libbpf_err_errno(ret);
 }
 
-int bpf_map_delete_batch(int fd, void *keys, __u32 *count,
+int bpf_map_delete_batch(int fd, const void *keys, __u32 *count,
 			 const struct bpf_map_batch_opts *opts)
 {
 	return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL,
-				    NULL, keys, NULL, count, opts);
+				    NULL, (void *)keys, NULL, count, opts);
 }
 
 int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys,
@@ -715,11 +715,11 @@ int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch,
 				    count, opts);
 }
 
-int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
+int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count,
 			 const struct bpf_map_batch_opts *opts)
 {
 	return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL,
-				    keys, values, count, opts);
+				    (void *)keys, (void *)values, count, opts);
 }
 
 int bpf_obj_pin(int fd, const char *pathname)
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 00619f64a040..14e0d97ad2cf 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -254,17 +254,128 @@ struct bpf_map_batch_opts {
 };
 #define bpf_map_batch_opts__last_field flags
 
-LIBBPF_API int bpf_map_delete_batch(int fd, void *keys,
+
+/**
+ * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple
+ * elements in a BPF map.
+ *
+ * @param fd BPF map file descriptor
+ * @param keys pointer to an array of *count* keys
+ * @param count input and output parameter; on input **count** represents the
+ * number of  elements in the map to delete in batch;
+ * on output if a non-EFAULT error is returned, **count** represents the number of deleted
+ * elements if the output **count** value is not equal to the input **count** value
+ * If EFAULT is returned, **count** should not be trusted to be correct.
+ * @param opts options for configuring the way the batch deletion works
+ * @return 0, on success; negative error code, otherwise (errno is also set to
+ * the error code)
+ */
+LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys,
 				    __u32 *count,
 				    const struct bpf_map_batch_opts *opts);
+
+/**
+ * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements.
+ *
+ * The parameter *in_batch* is the address of the first element in the batch to read.
+ * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent
+ * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate
+ * that the batched lookup starts from the beginning of the map.
+ *
+ * The *keys* and *values* are output parameters which must point to memory large enough to
+ * hold *count* items based on the key and value size of the map *map_fd*. The *keys*
+ * buffer must be of *key_size* * *count*. The *values* buffer must be of
+ * *value_size* * *count*.
+ *
+ * @param fd BPF map file descriptor
+ * @param in_batch address of the first element in batch to read, can pass NULL to
+ * indicate that the batched lookup starts from the beginning of the map.
+ * @param out_batch output parameter that should be passed to next call as *in_batch*
+ * @param keys pointer to an array large enough for *count* keys
+ * @param values pointer to an array large enough for *count* values
+ * @param count input and output parameter; on input it's the number of elements
+ * in the map to read in batch; on output it's the number of elements that were
+ * successfully read.
+ * If a non-EFAULT error is returned, count will be set as the number of elements
+ * that were read before the error occurred.
+ * If EFAULT is returned, **count** should not be trusted to be correct.
+ * @param opts options for configuring the way the batch lookup works
+ * @return 0, on success; negative error code, otherwise (errno is also set to
+ * the error code)
+ */
 LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch,
 				    void *keys, void *values, __u32 *count,
 				    const struct bpf_map_batch_opts *opts);
+
+/**
+ * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion
+ * of BPF map elements where each element is deleted after being retrieved.
+ *
+ * @param fd BPF map file descriptor
+ * @param in_batch address of the first element in batch to read, can pass NULL to
+ * get address of the first element in *out_batch*
+ * @param out_batch output parameter that should be passed to next call as *in_batch*
+ * @param keys pointer to an array of *count* keys
+ * @param values pointer to an array large enough for *count* values
+ * @param count input and output parameter; on input it's the number of elements
+ * in the map to read and delete in batch; on output it represents the number of
+ * elements that were successfully read and deleted
+ * If a non-**EFAULT** error code is returned and if the output **count** value
+ * is not equal to the input **count** value, up to **count** elements may
+ * have been deleted.
+ * if **EFAULT** is returned up to *count* elements may have been deleted without
+ * being returned via the *keys* and *values* output parameters.
+ * @param opts options for configuring the way the batch lookup and delete works
+ * @return 0, on success; negative error code, otherwise (errno is also set to
+ * the error code)
+ */
 LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch,
 					void *out_batch, void *keys,
 					void *values, __u32 *count,
 					const struct bpf_map_batch_opts *opts);
-LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values,
+
+/**
+ * @brief **bpf_map_update_batch()** updates multiple elements in a map
+ * by specifying keys and their corresponding values.
+ *
+ * The *keys* and *values* parameters must point to memory large enough
+ * to hold *count* items based on the key and value size of the map.
+ *
+ * The *opts* parameter can be used to control how *bpf_map_update_batch()*
+ * should handle keys that either do or do not already exist in the map.
+ * In particular the *flags* parameter of *bpf_map_batch_opts* can be
+ * one of the following:
+ *
+ * Note that *count* is an input and output parameter, where on output it
+ * represents how many elements were successfully updated. Also note that if
+ * **EFAULT** then *count* should not be trusted to be correct.
+ *
+ * **BPF_ANY**
+ *    Create new elements or update existing.
+ *
+ * **BPF_NOEXIST**
+ *    Create new elements only if they do not exist.
+ *
+ * **BPF_EXIST**
+ *    Update existing elements.
+ *
+ * **BPF_F_LOCK**
+ *    Update spin_lock-ed map elements. This must be
+ *    specified if the map value contains a spinlock.
+ *
+ * @param fd BPF map file descriptor
+ * @param keys pointer to an array of *count* keys
+ * @param values pointer to an array of *count* values
+ * @param count input and output parameter; on input it's the number of elements
+ * in the map to update in batch; on output if a non-EFAULT error is returned,
+ * **count** represents the number of updated elements if the output **count**
+ * value is not equal to the input **count** value.
+ * If EFAULT is returned, **count** should not be trusted to be correct.
+ * @param opts options for configuring the way the batch update works
+ * @return 0, on success; negative error code, otherwise (errno is also set to
+ * the error code)
+ */
+LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values,
 				    __u32 *count,
 				    const struct bpf_map_batch_opts *opts);
 
-- 
cgit v1.2.3


From 44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Thu, 6 Jan 2022 12:55:25 -0800
Subject: bpf/selftests: Test bpf_d_path on rdonly_mem.

The second parameter of bpf_d_path() can only accept writable
memories. Rdonly_mem obtained from bpf_per_cpu_ptr() can not
be passed into bpf_d_path for modification. This patch adds
a selftest to verify this behavior.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220106205525.2116218-1-haoluo@google.com
---
 tools/testing/selftests/bpf/prog_tests/d_path.c    | 22 ++++++++++++++++-
 .../bpf/progs/test_d_path_check_rdonly_mem.c       | 28 ++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
index 0a577a248d34..32fc5b3b5cf6 100644
--- a/tools/testing/selftests/bpf/prog_tests/d_path.c
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -9,6 +9,7 @@
 #define MAX_FILES		7
 
 #include "test_d_path.skel.h"
+#include "test_d_path_check_rdonly_mem.skel.h"
 
 static int duration;
 
@@ -99,7 +100,7 @@ out_close:
 	return ret;
 }
 
-void test_d_path(void)
+static void test_d_path_basic(void)
 {
 	struct test_d_path__bss *bss;
 	struct test_d_path *skel;
@@ -155,3 +156,22 @@ void test_d_path(void)
 cleanup:
 	test_d_path__destroy(skel);
 }
+
+static void test_d_path_check_rdonly_mem(void)
+{
+	struct test_d_path_check_rdonly_mem *skel;
+
+	skel = test_d_path_check_rdonly_mem__open_and_load();
+	ASSERT_ERR_PTR(skel, "unexpected_load_overwriting_rdonly_mem");
+
+	test_d_path_check_rdonly_mem__destroy(skel);
+}
+
+void test_d_path(void)
+{
+	if (test__start_subtest("basic"))
+		test_d_path_basic();
+
+	if (test__start_subtest("check_rdonly_mem"))
+		test_d_path_check_rdonly_mem();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c
new file mode 100644
index 000000000000..27c27cff6a3a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern const int bpf_prog_active __ksym;
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat,
+	     __u32 request_mask, unsigned int query_flags)
+{
+	void *active;
+	__u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* FAIL here! 'active' points to readonly memory. bpf helpers
+		 * that update its arguments can not write into it.
+		 */
+		bpf_d_path(path, active, sizeof(int));
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 6fd92c7f0c3846340fee20f62dacb17d0a15c0d3 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Thu, 6 Jan 2022 21:20:21 +0800
Subject: bpf: selftests: Use C99 initializers in test_sock.c

Use C99 initializers for the initialization of 'tests' in test_sock.c.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220106132022.3470772-3-imagedong@tencent.com
---
 tools/testing/selftests/bpf/test_sock.c | 220 +++++++++++++-------------------
 1 file changed, 92 insertions(+), 128 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index e8edd3dd3ec2..94f9b126f5ed 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -46,7 +46,7 @@ struct sock_test {
 
 static struct sock_test tests[] = {
 	{
-		"bind4 load with invalid access: src_ip6",
+		.descr = "bind4 load with invalid access: src_ip6",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
@@ -54,16 +54,12 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		LOAD_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = LOAD_REJECT,
 	},
 	{
-		"bind4 load with invalid access: mark",
+		.descr = "bind4 load with invalid access: mark",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
@@ -71,16 +67,12 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		LOAD_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = LOAD_REJECT,
 	},
 	{
-		"bind6 load with invalid access: src_ip4",
+		.descr = "bind6 load with invalid access: src_ip4",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
@@ -88,16 +80,12 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET6_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		LOAD_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.result = LOAD_REJECT,
 	},
 	{
-		"sock_create load with invalid access: src_port",
+		.descr = "sock_create load with invalid access: src_port",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
@@ -105,128 +93,106 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET_SOCK_CREATE,
-		BPF_CGROUP_INET_SOCK_CREATE,
-		0,
-		0,
-		NULL,
-		0,
-		LOAD_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.result = LOAD_REJECT,
 	},
 	{
-		"sock_create load w/o expected_attach_type (compat mode)",
+		.descr = "sock_create load w/o expected_attach_type (compat mode)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		0,
-		BPF_CGROUP_INET_SOCK_CREATE,
-		AF_INET,
-		SOCK_STREAM,
-		"127.0.0.1",
-		8097,
-		SUCCESS,
+		.expected_attach_type = 0,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 8097,
+		.result = SUCCESS,
 	},
 	{
-		"sock_create load w/ expected_attach_type",
+		.descr = "sock_create load w/ expected_attach_type",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET_SOCK_CREATE,
-		BPF_CGROUP_INET_SOCK_CREATE,
-		AF_INET,
-		SOCK_STREAM,
-		"127.0.0.1",
-		8097,
-		SUCCESS,
+		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 8097,
+		.result = SUCCESS,
 	},
 	{
-		"attach type mismatch bind4 vs bind6",
+		.descr = "attach type mismatch bind4 vs bind6",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET6_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		ATTACH_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.result = ATTACH_REJECT,
 	},
 	{
-		"attach type mismatch bind6 vs bind4",
+		.descr = "attach type mismatch bind6 vs bind4",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		ATTACH_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = ATTACH_REJECT,
 	},
 	{
-		"attach type mismatch default vs bind4",
+		.descr = "attach type mismatch default vs bind4",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		0,
-		BPF_CGROUP_INET4_POST_BIND,
-		0,
-		0,
-		NULL,
-		0,
-		ATTACH_REJECT,
+		.expected_attach_type = 0,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.result = ATTACH_REJECT,
 	},
 	{
-		"attach type mismatch bind6 vs sock_create",
+		.descr = "attach type mismatch bind6 vs sock_create",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET_SOCK_CREATE,
-		0,
-		0,
-		NULL,
-		0,
-		ATTACH_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET_SOCK_CREATE,
+		.result = ATTACH_REJECT,
 	},
 	{
-		"bind4 reject all",
+		.descr = "bind4 reject all",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		"0.0.0.0",
-		0,
-		BIND_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "0.0.0.0",
+		.result = BIND_REJECT,
 	},
 	{
-		"bind6 reject all",
+		.descr = "bind6 reject all",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET6_POST_BIND,
-		AF_INET6,
-		SOCK_STREAM,
-		"::",
-		0,
-		BIND_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::",
+		.result = BIND_REJECT,
 	},
 	{
-		"bind6 deny specific IP & port",
+		.descr = "bind6 deny specific IP & port",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 
@@ -247,16 +213,16 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET6_POST_BIND,
-		AF_INET6,
-		SOCK_STREAM,
-		"::1",
-		8193,
-		BIND_REJECT,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::1",
+		.port = 8193,
+		.result = BIND_REJECT,
 	},
 	{
-		"bind4 allow specific IP & port",
+		.descr = "bind4 allow specific IP & port",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 
@@ -277,41 +243,39 @@ static struct sock_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		"127.0.0.1",
-		4098,
-		SUCCESS,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.result = SUCCESS,
 	},
 	{
-		"bind4 allow all",
+		.descr = "bind4 allow all",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET4_POST_BIND,
-		BPF_CGROUP_INET4_POST_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		"0.0.0.0",
-		0,
-		SUCCESS,
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "0.0.0.0",
+		.result = SUCCESS,
 	},
 	{
-		"bind6 allow all",
+		.descr = "bind6 allow all",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		BPF_CGROUP_INET6_POST_BIND,
-		BPF_CGROUP_INET6_POST_BIND,
-		AF_INET6,
-		SOCK_STREAM,
-		"::",
-		0,
-		SUCCESS,
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::",
+		.result = SUCCESS,
 	},
 };
 
-- 
cgit v1.2.3


From f7342481749365d9ac5f24fb971659a64e045bb5 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Thu, 6 Jan 2022 21:20:22 +0800
Subject: bpf: selftests: Add bind retry for post_bind{4, 6}

With previous patch, kernel is able to 'put_port' after sys_bind()
fails. Add the test for that case: rebind another port after
sys_bind() fails. If the bind success, it means previous bind
operation is already undoed.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220106132022.3470772-4-imagedong@tencent.com
---
 tools/testing/selftests/bpf/test_sock.c | 150 +++++++++++++++++++++++++++-----
 1 file changed, 130 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 94f9b126f5ed..fe10f8134278 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -35,12 +35,15 @@ struct sock_test {
 	/* Endpoint to bind() to */
 	const char *ip;
 	unsigned short port;
+	unsigned short port_retry;
 	/* Expected test result */
 	enum {
 		LOAD_REJECT,
 		ATTACH_REJECT,
 		BIND_REJECT,
 		SUCCESS,
+		RETRY_SUCCESS,
+		RETRY_REJECT
 	} result;
 };
 
@@ -251,6 +254,99 @@ static struct sock_test tests[] = {
 		.port = 4098,
 		.result = SUCCESS,
 	},
+	{
+		.descr = "bind4 deny specific IP & port of TCP, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x7F000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_STREAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.port_retry = 5000,
+		.result = RETRY_SUCCESS,
+	},
+	{
+		.descr = "bind4 deny specific IP & port of UDP, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip4)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x7F000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.attach_type = BPF_CGROUP_INET4_POST_BIND,
+		.domain = AF_INET,
+		.type = SOCK_DGRAM,
+		.ip = "127.0.0.1",
+		.port = 4098,
+		.port_retry = 5000,
+		.result = RETRY_SUCCESS,
+	},
+	{
+		.descr = "bind6 deny specific IP & port, and retry",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+			/* if (ip == expected && port == expected) */
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_ip6[3])),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+				    __bpf_constant_ntohl(0x00000001), 4),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+				    offsetof(struct bpf_sock, src_port)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+			/* return DENY; */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_A(1),
+
+			/* else return ALLOW; */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.expected_attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.attach_type = BPF_CGROUP_INET6_POST_BIND,
+		.domain = AF_INET6,
+		.type = SOCK_STREAM,
+		.ip = "::1",
+		.port = 8193,
+		.port_retry = 9000,
+		.result = RETRY_SUCCESS,
+	},
 	{
 		.descr = "bind4 allow all",
 		.insns = {
@@ -315,14 +411,15 @@ static int attach_sock_prog(int cgfd, int progfd,
 	return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE);
 }
 
-static int bind_sock(int domain, int type, const char *ip, unsigned short port)
+static int bind_sock(int domain, int type, const char *ip,
+		     unsigned short port, unsigned short port_retry)
 {
 	struct sockaddr_storage addr;
 	struct sockaddr_in6 *addr6;
 	struct sockaddr_in *addr4;
 	int sockfd = -1;
 	socklen_t len;
-	int err = 0;
+	int res = SUCCESS;
 
 	sockfd = socket(domain, type, 0);
 	if (sockfd < 0)
@@ -348,21 +445,44 @@ static int bind_sock(int domain, int type, const char *ip, unsigned short port)
 		goto err;
 	}
 
-	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1)
-		goto err;
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+		/* sys_bind() may fail for different reasons, errno has to be
+		 * checked to confirm that BPF program rejected it.
+		 */
+		if (errno != EPERM)
+			goto err;
+		if (port_retry)
+			goto retry;
+		res = BIND_REJECT;
+		goto out;
+	}
 
+	goto out;
+retry:
+	if (domain == AF_INET)
+		addr4->sin_port = htons(port_retry);
+	else
+		addr6->sin6_port = htons(port_retry);
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+		if (errno != EPERM)
+			goto err;
+		res = RETRY_REJECT;
+	} else {
+		res = RETRY_SUCCESS;
+	}
 	goto out;
 err:
-	err = -1;
+	res = -1;
 out:
 	close(sockfd);
-	return err;
+	return res;
 }
 
 static int run_test_case(int cgfd, const struct sock_test *test)
 {
 	int progfd = -1;
 	int err = 0;
+	int res;
 
 	printf("Test case: %s .. ", test->descr);
 	progfd = load_sock_prog(test->insns, test->expected_attach_type);
@@ -380,21 +500,11 @@ static int run_test_case(int cgfd, const struct sock_test *test)
 			goto err;
 	}
 
-	if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) {
-		/* sys_bind() may fail for different reasons, errno has to be
-		 * checked to confirm that BPF program rejected it.
-		 */
-		if (test->result == BIND_REJECT && errno == EPERM)
-			goto out;
-		else
-			goto err;
-	}
-
+	res = bind_sock(test->domain, test->type, test->ip, test->port,
+			test->port_retry);
+	if (res > 0 && test->result == res)
+		goto out;
 
-	if (test->result != SUCCESS)
-		goto err;
-
-	goto out;
 err:
 	err = -1;
 out:
-- 
cgit v1.2.3


From 05be5e273c84f6329b3cac4f5bb7b1fe6eaefd5d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 6 Jan 2022 16:20:19 -0800
Subject: selftests: mptcp: add disconnect tests

Performs several disconnect/reconnect on the same socket,
ensuring the overall transfer is succesful.

The new test leverages ioctl(SIOCOUTQ) to ensure all the
pending data is acked before disconnecting.

Additionally order alphabetically the test program arguments list
for better maintainability.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c  | 148 +++++++++++++++++----
 tools/testing/selftests/net/mptcp/mptcp_connect.sh |  39 +++++-
 2 files changed, 160 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index a30e93c5c549..8628aa61b763 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -16,6 +16,7 @@
 #include <unistd.h>
 #include <time.h>
 
+#include <sys/ioctl.h>
 #include <sys/poll.h>
 #include <sys/sendfile.h>
 #include <sys/stat.h>
@@ -28,6 +29,7 @@
 
 #include <linux/tcp.h>
 #include <linux/time_types.h>
+#include <linux/sockios.h>
 
 extern int optind;
 
@@ -68,6 +70,8 @@ static unsigned int cfg_time;
 static unsigned int cfg_do_w;
 static int cfg_wait;
 static uint32_t cfg_mark;
+static char *cfg_input;
+static int cfg_repeat = 1;
 
 struct cfg_cmsg_types {
 	unsigned int cmsg_enabled:1;
@@ -91,22 +95,31 @@ static struct cfg_sockopt_types cfg_sockopt_types;
 
 static void die_usage(void)
 {
-	fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]"
-		"[-l] [-w sec] [-t num] [-T num] connect_address\n");
+	fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-i file] [-I num] [-j] [-l] "
+		"[-m mode] [-M mark] [-o option] [-p port] [-P mode] [-j] [-l] [-r num] "
+		"[-s MPTCP|TCP] [-S num] [-r num] [-t num] [-T num] [-u] [-w sec] connect_address\n");
 	fprintf(stderr, "\t-6 use ipv6\n");
-	fprintf(stderr, "\t-t num -- set poll timeout to num\n");
-	fprintf(stderr, "\t-T num -- set expected runtime to num ms\n");
-	fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n");
-	fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n");
-	fprintf(stderr, "\t-p num -- use port num\n");
-	fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
+	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
+	fprintf(stderr, "\t-i file -- read the data to send from the given file instead of stdin");
+	fprintf(stderr, "\t-I num -- repeat the transfer 'num' times. In listen mode accepts num "
+		"incoming connections, in client mode, disconnect and reconnect to the server\n");
+	fprintf(stderr, "\t-j     -- add additional sleep at connection start and tear down "
+		"-- for MPJ tests\n");
+	fprintf(stderr, "\t-l     -- listens mode, accepts incoming connection\n");
 	fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
 	fprintf(stderr, "\t-M mark -- set socket packet mark\n");
-	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
-	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
 	fprintf(stderr, "\t-o option -- test sockopt <option>\n");
+	fprintf(stderr, "\t-p num -- use port num\n");
 	fprintf(stderr,
 		"\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n");
+	fprintf(stderr, "\t-t num -- set poll timeout to num\n");
+	fprintf(stderr, "\t-T num -- set expected runtime to num ms\n");
+	fprintf(stderr, "\t-r num -- enable slow mode, limiting each write to num bytes "
+		"-- for remove addr tests\n");
+	fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n");
+	fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
+	fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n");
+	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
 	exit(1);
 }
 
@@ -310,7 +323,8 @@ static int sock_listen_mptcp(const char * const listenaddr,
 }
 
 static int sock_connect_mptcp(const char * const remoteaddr,
-			      const char * const port, int proto)
+			      const char * const port, int proto,
+			      struct addrinfo **peer)
 {
 	struct addrinfo hints = {
 		.ai_protocol = IPPROTO_TCP,
@@ -334,8 +348,10 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 		if (cfg_mark)
 			set_mark(sock, cfg_mark);
 
-		if (connect(sock, a->ai_addr, a->ai_addrlen) == 0)
+		if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) {
+			*peer = a;
 			break; /* success */
+		}
 
 		perror("connect()");
 		close(sock);
@@ -524,14 +540,17 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len)
 	return ret;
 }
 
-static void set_nonblock(int fd)
+static void set_nonblock(int fd, bool nonblock)
 {
 	int flags = fcntl(fd, F_GETFL);
 
 	if (flags == -1)
 		return;
 
-	fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+	if (nonblock)
+		fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+	else
+		fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
 }
 
 static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after_out)
@@ -543,7 +562,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 	unsigned int woff = 0, wlen = 0;
 	char wbuf[8192];
 
-	set_nonblock(peerfd);
+	set_nonblock(peerfd, true);
 
 	for (;;) {
 		char rbuf[8192];
@@ -638,7 +657,6 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 	if (cfg_remove)
 		usleep(cfg_wait);
 
-	close(peerfd);
 	return 0;
 }
 
@@ -780,7 +798,7 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
 	return err;
 }
 
-static int copyfd_io(int infd, int peerfd, int outfd)
+static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd)
 {
 	bool in_closed_after_out = false;
 	struct timespec start, end;
@@ -819,6 +837,9 @@ static int copyfd_io(int infd, int peerfd, int outfd)
 	if (ret)
 		return ret;
 
+	if (close_peerfd)
+		close(peerfd);
+
 	if (cfg_time) {
 		unsigned int delta_ms;
 
@@ -930,7 +951,7 @@ static void maybe_close(int fd)
 {
 	unsigned int r = rand();
 
-	if (!(cfg_join || cfg_remove) && (r & 1))
+	if (!(cfg_join || cfg_remove || cfg_repeat > 1) && (r & 1))
 		close(fd);
 }
 
@@ -940,7 +961,9 @@ int main_loop_s(int listensock)
 	struct pollfd polls;
 	socklen_t salen;
 	int remotesock;
+	int fd = 0;
 
+again:
 	polls.fd = listensock;
 	polls.events = POLLIN;
 
@@ -961,14 +984,27 @@ int main_loop_s(int listensock)
 		check_sockaddr(pf, &ss, salen);
 		check_getpeername(remotesock, &ss, salen);
 
+		if (cfg_input) {
+			fd = open(cfg_input, O_RDONLY);
+			if (fd < 0)
+				xerror("can't open %s: %d", cfg_input, errno);
+		}
+
 		SOCK_TEST_TCPULP(remotesock, 0);
 
-		return copyfd_io(0, remotesock, 1);
+		copyfd_io(fd, remotesock, 1, true);
+	} else {
+		perror("accept");
+		return 1;
 	}
 
-	perror("accept");
+	if (--cfg_repeat > 0) {
+		if (cfg_input)
+			close(fd);
+		goto again;
+	}
 
-	return 1;
+	return 0;
 }
 
 static void init_rng(void)
@@ -1057,15 +1093,47 @@ static void parse_setsock_options(const char *name)
 	exit(1);
 }
 
+void xdisconnect(int fd, int addrlen)
+{
+	struct sockaddr_storage empty;
+	int msec_sleep = 10;
+	int queued = 1;
+	int i;
+
+	shutdown(fd, SHUT_WR);
+
+	/* while until the pending data is completely flushed, the later
+	 * disconnect will bypass/ignore/drop any pending data.
+	 */
+	for (i = 0; ; i += msec_sleep) {
+		if (ioctl(fd, SIOCOUTQ, &queued) < 0)
+			xerror("can't query out socket queue: %d", errno);
+
+		if (!queued)
+			break;
+
+		if (i > poll_timeout)
+			xerror("timeout while waiting for spool to complete");
+		usleep(msec_sleep * 1000);
+	}
+
+	memset(&empty, 0, sizeof(empty));
+	empty.ss_family = AF_UNSPEC;
+	if (connect(fd, (struct sockaddr *)&empty, addrlen) < 0)
+		xerror("can't disconnect: %d", errno);
+}
+
 int main_loop(void)
 {
-	int fd;
+	int fd, ret, fd_in = 0;
+	struct addrinfo *peer;
 
 	/* listener is ready. */
-	fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto);
+	fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto, &peer);
 	if (fd < 0)
 		return 2;
 
+again:
 	check_getpeername_connect(fd);
 
 	SOCK_TEST_TCPULP(fd, cfg_sock_proto);
@@ -1077,7 +1145,31 @@ int main_loop(void)
 	if (cfg_cmsg_types.cmsg_enabled)
 		apply_cmsg_types(fd, &cfg_cmsg_types);
 
-	return copyfd_io(0, fd, 1);
+	if (cfg_input) {
+		fd_in = open(cfg_input, O_RDONLY);
+		if (fd < 0)
+			xerror("can't open %s:%d", cfg_input, errno);
+	}
+
+	/* close the client socket open only if we are not going to reconnect */
+	ret = copyfd_io(fd_in, fd, 1, cfg_repeat == 1);
+	if (ret)
+		return ret;
+
+	if (--cfg_repeat > 0) {
+		xdisconnect(fd, peer->ai_addrlen);
+
+		/* the socket could be unblocking at this point, we need the
+		 * connect to be blocking
+		 */
+		set_nonblock(fd, false);
+		if (connect(fd, peer->ai_addr, peer->ai_addrlen))
+			xerror("can't reconnect: %d", errno);
+		if (cfg_input)
+			close(fd_in);
+		goto again;
+	}
+	return 0;
 }
 
 int parse_proto(const char *proto)
@@ -1162,7 +1254,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) {
+	while ((c = getopt(argc, argv, "6c:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
 		switch (c) {
 		case 'j':
 			cfg_join = true;
@@ -1176,6 +1268,12 @@ static void parse_opts(int argc, char **argv)
 			if (cfg_do_w <= 0)
 				cfg_do_w = 50;
 			break;
+		case 'i':
+			cfg_input = optarg;
+			break;
+		case 'I':
+			cfg_repeat = atoi(optarg);
+			break;
 		case 'l':
 			listen_mode = true;
 			break;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index ee28f82a6c89..cb5809b89081 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -7,6 +7,7 @@ optstring="S:R:d:e:l:r:h4cm:f:tC"
 ret=0
 sin=""
 sout=""
+cin_disconnect=""
 cin=""
 cout=""
 ksft_skip=4
@@ -24,6 +25,7 @@ options_log=true
 do_tcp=0
 checksum=false
 filesize=0
+connect_per_transfer=1
 
 if [ $tc_loss -eq 100 ];then
 	tc_loss=1%
@@ -127,6 +129,7 @@ TEST_COUNT=0
 
 cleanup()
 {
+	rm -f "$cin_disconnect" "$cout_disconnect"
 	rm -f "$cin" "$cout"
 	rm -f "$sin" "$sout"
 	rm -f "$capout"
@@ -149,6 +152,8 @@ sout=$(mktemp)
 cin=$(mktemp)
 cout=$(mktemp)
 capout=$(mktemp)
+cin_disconnect="$cin".disconnect
+cout_disconnect="$cout".disconnect
 trap cleanup EXIT
 
 for i in "$ns1" "$ns2" "$ns3" "$ns4";do
@@ -500,8 +505,8 @@ do_transfer()
 	cookies=${cookies##*=}
 
 	if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then
-		expect_synrx=$((stat_synrx_last_l+1))
-		expect_ackrx=$((stat_ackrx_last_l+1))
+		expect_synrx=$((stat_synrx_last_l+$connect_per_transfer))
+		expect_ackrx=$((stat_ackrx_last_l+$connect_per_transfer))
 	fi
 
 	if [ ${stat_synrx_now_l} -lt ${expect_synrx} ]; then
@@ -738,6 +743,33 @@ run_tests_peekmode()
 	run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}"
 }
 
+run_tests_disconnect()
+{
+	local peekmode="$1"
+	local old_cin=$cin
+	local old_sin=$sin
+
+	cat $cin $cin $cin > "$cin".disconnect
+
+	# force do_transfer to cope with the multiple tranmissions
+	sin="$cin.disconnect"
+	sin_disconnect=$old_sin
+	cin="$cin.disconnect"
+	cin_disconnect="$old_cin"
+	connect_per_transfer=3
+
+	echo "INFO: disconnect"
+	run_tests_lo "$ns1" "$ns1" 10.0.1.1 1 "-I 3 -i $old_cin"
+	run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-I 3 -i $old_cin"
+
+	# restore previous status
+	cout=$old_cout
+	cout_disconnect="$cout".disconnect
+	cin=$old_cin
+	cin_disconnect="$cin".disconnect
+	connect_per_transfer=1
+}
+
 display_time()
 {
 	time_end=$(date +%s)
@@ -853,6 +885,9 @@ stop_if_error "Tests with peek mode have failed"
 # connect to ns4 ip address, ns2 should intercept/proxy
 run_test_transparent 10.0.3.1 "tproxy ipv4"
 run_test_transparent dead:beef:3::1 "tproxy ipv6"
+stop_if_error "Tests with tproxy have failed"
+
+run_tests_disconnect
 
 display_time
 exit $ret
-- 
cgit v1.2.3


From 86e39e04482b0aadf3ee3ed5fcf2d63816559d36 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 6 Jan 2022 16:20:22 -0800
Subject: mptcp: keep track of local endpoint still available for each msk

Include into the path manager status a bitmap tracking the list
of local endpoints still available - not yet used - for the
relevant mptcp socket.

Keep such map updated at endpoint creation/deletion time, so
that we can easily skip already used endpoint at local address
selection time.

The endpoint used by the initial subflow is lazyly accounted at
subflow creation time: the usage bitmap is be up2date before
endpoint selection and we avoid such unneeded task in some relevant
scenarios - e.g. busy servers accepting incoming subflows but
not creating any additional ones nor annuncing additional addresses.

Overall this allows for fair local endpoints usage in case of
subflow failure.

As a side effect, this patch also enforces that each endpoint
is used at most once for each mptcp connection.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/pm.c                                  |   1 +
 net/mptcp/pm_netlink.c                          | 125 ++++++++++++++----------
 net/mptcp/protocol.c                            |   3 +-
 net/mptcp/protocol.h                            |  12 ++-
 tools/testing/selftests/net/mptcp/mptcp_join.sh |   5 +-
 5 files changed, 91 insertions(+), 55 deletions(-)

(limited to 'tools')

diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 6b2bfe89d445..e9ba9551832c 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -370,6 +370,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
 	WRITE_ONCE(msk->pm.accept_subflow, false);
 	WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
 	msk->pm.status = 0;
+	bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 
 	mptcp_pm_nl_data_init(msk);
 }
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 27427aeeee0e..ad3dc9c6c531 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -38,10 +38,6 @@ struct mptcp_pm_add_entry {
 	u8			retrans_times;
 };
 
-/* max value of mptcp_addr_info.id */
-#define MAX_ADDR_ID		U8_MAX
-#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
-
 struct pm_nl_pernet {
 	/* protects pernet updates */
 	spinlock_t		lock;
@@ -53,14 +49,14 @@ struct pm_nl_pernet {
 	unsigned int		local_addr_max;
 	unsigned int		subflows_max;
 	unsigned int		next_id;
-	unsigned long		id_bitmap[BITMAP_SZ];
+	DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 };
 
 #define MPTCP_PM_ADDR_MAX	8
 #define ADD_ADDR_RETRANS_MAX	3
 
 static bool addresses_equal(const struct mptcp_addr_info *a,
-			    struct mptcp_addr_info *b, bool use_port)
+			    const struct mptcp_addr_info *b, bool use_port)
 {
 	bool addr_equals = false;
 
@@ -174,6 +170,9 @@ select_local_address(const struct pm_nl_pernet *pernet,
 		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
 			continue;
 
+		if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+			continue;
+
 		if (entry->addr.family != sk->sk_family) {
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 			if ((entry->addr.family == AF_INET &&
@@ -184,23 +183,17 @@ select_local_address(const struct pm_nl_pernet *pernet,
 				continue;
 		}
 
-		/* avoid any address already in use by subflows and
-		 * pending join
-		 */
-		if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
-			ret = entry;
-			break;
-		}
+		ret = entry;
+		break;
 	}
 	rcu_read_unlock();
 	return ret;
 }
 
 static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
+select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
 {
 	struct mptcp_pm_addr_entry *entry, *ret = NULL;
-	int i = 0;
 
 	rcu_read_lock();
 	/* do not keep any additional per socket state, just signal
@@ -209,12 +202,14 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
 	 * can lead to additional addresses not being announced.
 	 */
 	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+		if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+			continue;
+
 		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
 			continue;
-		if (i++ == pos) {
-			ret = entry;
-			break;
-		}
+
+		ret = entry;
+		break;
 	}
 	rcu_read_unlock();
 	return ret;
@@ -258,9 +253,11 @@ EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
 
 static void check_work_pending(struct mptcp_sock *msk)
 {
-	if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) &&
-	    (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) ||
-	     msk->pm.subflows == mptcp_pm_get_subflows_max(msk)))
+	struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+	if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
+	    (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
+			       MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1))
 		WRITE_ONCE(msk->pm.work_pending, false);
 }
 
@@ -460,6 +457,35 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
 	return i;
 }
 
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+	struct mptcp_pm_addr_entry *entry;
+
+	list_for_each_entry(entry, &pernet->local_addr_list, list) {
+		if (entry->addr.id == id)
+			return entry;
+	}
+	return NULL;
+}
+
+static int
+lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
+{
+	struct mptcp_pm_addr_entry *entry;
+	int ret = -1;
+
+	rcu_read_lock();
+	list_for_each_entry(entry, &pernet->local_addr_list, list) {
+		if (addresses_equal(&entry->addr, addr, entry->addr.port)) {
+			ret = entry->addr.id;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
@@ -475,6 +501,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 	local_addr_max = mptcp_pm_get_local_addr_max(msk);
 	subflows_max = mptcp_pm_get_subflows_max(msk);
 
+	/* do lazy endpoint usage accounting for the MPC subflows */
+	if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
+		struct mptcp_addr_info local;
+		int mpc_id;
+
+		local_address((struct sock_common *)msk->first, &local);
+		mpc_id = lookup_id_by_addr(pernet, &local);
+		if (mpc_id < 0)
+			__clear_bit(mpc_id, msk->pm.id_avail_bitmap);
+
+		msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+	}
+
 	pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
 		 msk->pm.local_addr_used, local_addr_max,
 		 msk->pm.add_addr_signaled, add_addr_signal_max,
@@ -482,21 +521,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 
 	/* check first for announce */
 	if (msk->pm.add_addr_signaled < add_addr_signal_max) {
-		local = select_signal_address(pernet,
-					      msk->pm.add_addr_signaled);
+		local = select_signal_address(pernet, msk);
 
 		if (local) {
 			if (mptcp_pm_alloc_anno_list(msk, local)) {
+				__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
 				msk->pm.add_addr_signaled++;
 				mptcp_pm_announce_addr(msk, &local->addr, false);
 				mptcp_pm_nl_addr_send_ack(msk);
 			}
-		} else {
-			/* pick failed, avoid fourther attempts later */
-			msk->pm.local_addr_used = add_addr_signal_max;
 		}
-
-		check_work_pending(msk);
 	}
 
 	/* check if should create a new subflow */
@@ -510,19 +544,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
 			int i, nr;
 
 			msk->pm.local_addr_used++;
-			check_work_pending(msk);
 			nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
+			if (nr)
+				__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
 			spin_unlock_bh(&msk->pm.lock);
 			for (i = 0; i < nr; i++)
 				__mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
 			spin_lock_bh(&msk->pm.lock);
-			return;
 		}
-
-		/* lookup failed, avoid fourther attempts later */
-		msk->pm.local_addr_used = local_addr_max;
-		check_work_pending(msk);
 	}
+	check_work_pending(msk);
 }
 
 static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
@@ -736,6 +767,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
 			msk->pm.subflows--;
 			__MPTCP_INC_STATS(sock_net(sk), rm_type);
 		}
+		__set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap);
 		if (!removed)
 			continue;
 
@@ -765,6 +797,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)
 
 	msk_owned_by_me(msk);
 
+	if (!(pm->status & MPTCP_PM_WORK_MASK))
+		return;
+
 	spin_lock_bh(&msk->pm.lock);
 
 	pr_debug("msk=%p status=%x", msk, pm->status);
@@ -810,7 +845,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
 	/* to keep the code simple, don't do IDR-like allocation for address ID,
 	 * just bail when we exceed limits
 	 */
-	if (pernet->next_id == MAX_ADDR_ID)
+	if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
 		pernet->next_id = 1;
 	if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
 		goto out;
@@ -830,7 +865,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
 	if (!entry->addr.id) {
 find_next:
 		entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
-						    MAX_ADDR_ID + 1,
+						    MPTCP_PM_MAX_ADDR_ID + 1,
 						    pernet->next_id);
 		if (!entry->addr.id && pernet->next_id != 1) {
 			pernet->next_id = 1;
@@ -1197,18 +1232,6 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static struct mptcp_pm_addr_entry *
-__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
-{
-	struct mptcp_pm_addr_entry *entry;
-
-	list_for_each_entry(entry, &pernet->local_addr_list, list) {
-		if (entry->addr.id == id)
-			return entry;
-	}
-	return NULL;
-}
-
 int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
 					 u8 *flags, int *ifindex)
 {
@@ -1467,7 +1490,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
 	list_splice_init(&pernet->local_addr_list, &free_list);
 	__reset_counters(pernet);
 	pernet->next_id = 1;
-	bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
+	bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 	spin_unlock_bh(&pernet->lock);
 	mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
 	synchronize_rcu();
@@ -1577,7 +1600,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
 	pernet = net_generic(net, pm_nl_pernet_id);
 
 	spin_lock_bh(&pernet->lock);
-	for (i = id; i < MAX_ADDR_ID + 1; i++) {
+	for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
 		if (test_bit(i, pernet->id_bitmap)) {
 			entry = __lookup_addr_by_id(pernet, i);
 			if (!entry)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 667e153e6e24..5c956a8dc714 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2488,8 +2488,7 @@ static void mptcp_worker(struct work_struct *work)
 
 	mptcp_check_fastclose(msk);
 
-	if (msk->pm.status)
-		mptcp_pm_nl_work(msk);
+	mptcp_pm_nl_work(msk);
 
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 386d1a98ff1d..2a6f0960ba27 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -173,16 +173,25 @@ enum mptcp_pm_status {
 	MPTCP_PM_ADD_ADDR_SEND_ACK,
 	MPTCP_PM_RM_ADDR_RECEIVED,
 	MPTCP_PM_ESTABLISHED,
-	MPTCP_PM_ALREADY_ESTABLISHED,	/* persistent status, set after ESTABLISHED event */
 	MPTCP_PM_SUBFLOW_ESTABLISHED,
+	MPTCP_PM_ALREADY_ESTABLISHED,	/* persistent status, set after ESTABLISHED event */
+	MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
+					 * accounted int id_avail_bitmap
+					 */
 };
 
+/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
+#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
+
 enum mptcp_addr_signal_status {
 	MPTCP_ADD_ADDR_SIGNAL,
 	MPTCP_ADD_ADDR_ECHO,
 	MPTCP_RM_ADDR_SIGNAL,
 };
 
+/* max value of mptcp_addr_info.id */
+#define MPTCP_PM_MAX_ADDR_ID		U8_MAX
+
 struct mptcp_pm_data {
 	struct mptcp_addr_info local;
 	struct mptcp_addr_info remote;
@@ -201,6 +210,7 @@ struct mptcp_pm_data {
 	u8		local_addr_used;
 	u8		subflows;
 	u8		status;
+	DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 	struct mptcp_rm_list rm_list_tx;
 	struct mptcp_rm_list rm_list_rx;
 };
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 7ef639a9d4a6..bbafa4cf5453 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1071,7 +1071,10 @@ signal_address_tests()
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal
 	run_tests $ns1 $ns2 10.0.1.1
-	chk_add_nr 4 4
+
+	# the server will not signal the address terminating
+	# the MPC subflow
+	chk_add_nr 3 3
 }
 
 link_failure_tests()
-- 
cgit v1.2.3


From 46e967d187ed1693b34379ac4e27fd4e1fb40a45 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 6 Jan 2022 16:20:24 -0800
Subject: selftests: mptcp: add tests for subflow creation failure

Verify that, when multiple endpoints are available, subflows
creation proceed even when the first additional subflow creation
fails - due to packet drop on the relevant link

Co-developed-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/config        |  1 +
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 78 ++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index d548bc139b5d..d36b7da5082a 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -17,4 +17,5 @@ CONFIG_NFT_TPROXY=m
 CONFIG_NFT_SOCKET=m
 CONFIG_IP_ADVANCED_ROUTER=y
 CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_TARGET_REJECT=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index bbafa4cf5453..3165bd1a43cc 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -937,6 +937,22 @@ chk_link_usage()
 	fi
 }
 
+wait_for_tw()
+{
+	local timeout_ms=$((timeout_poll * 1000))
+	local time=0
+	local ns=$1
+
+	while [ $time -lt $timeout_ms ]; do
+		local cnt=$(ip netns exec $ns ss -t state time-wait |wc -l)
+
+		[ "$cnt" = 1 ] && return 1
+		time=$((time + 100))
+		sleep 0.1
+	done
+	return 1
+}
+
 subflows_tests()
 {
 	reset
@@ -994,6 +1010,61 @@ subflows_tests()
 	chk_join_nr "single subflow, dev" 1 1 1
 }
 
+subflows_error_tests()
+{
+	# If a single subflow is configured, and matches the MPC src
+	# address, no additional subflow should be created
+	reset
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+	ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.1.2 flags subflow
+	run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
+	chk_join_nr "no MPC reuse with single endpoint" 0 0 0
+
+	# multiple subflows, with subflow creation error
+	reset
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+	ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
+	run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
+	chk_join_nr "multi subflows, with failing subflow" 1 1 1
+
+	# multiple subflows, with subflow timeout on MPJ
+	reset
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+	ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j DROP
+	run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
+	chk_join_nr "multi subflows, with subflow timeout" 1 1 1
+
+	# multiple subflows, check that the endpoint corresponding to
+	# closed subflow (due to reset) is not reused if additional
+	# subflows are added later
+	reset
+	ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+	ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+	ip netns exec $ns1 iptables -A INPUT -s 10.0.3.2 -p tcp -j REJECT
+	run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow &
+
+	# updates in the child shell do not have any effect here, we
+	# need to bump the test counter for the above case
+	TEST_COUNT=$((TEST_COUNT+1))
+
+	# mpj subflow will be in TW after the reset
+	wait_for_tw $ns2
+	ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+	wait
+
+	# additional subflow could be created only if the PM select
+	# the later endpoint, skipping the already used one
+	chk_join_nr "multi subflows, fair usage on close" 1 1 1
+}
+
 signal_address_tests()
 {
 	# add_address, unused
@@ -1805,6 +1876,7 @@ fullmesh_tests()
 all_tests()
 {
 	subflows_tests
+	subflows_error_tests
 	signal_address_tests
 	link_failure_tests
 	add_addr_timeout_tests
@@ -1824,6 +1896,7 @@ usage()
 {
 	echo "mptcp_join usage:"
 	echo "  -f subflows_tests"
+	echo "  -e subflows_error_tests"
 	echo "  -s signal_address_tests"
 	echo "  -l link_failure_tests"
 	echo "  -t add_addr_timeout_tests"
@@ -1872,11 +1945,14 @@ if [ $do_all_tests -eq 1 ]; then
 	exit $ret
 fi
 
-while getopts 'fsltra64bpkdmchCS' opt; do
+while getopts 'fesltra64bpkdmchCS' opt; do
 	case $opt in
 		f)
 			subflows_tests
 			;;
+		e)
+			subflows_error_tests
+			;;
 		s)
 			signal_address_tests
 			;;
-- 
cgit v1.2.3


From 4c66b56781eb114d478d8488e614a4f832a44a2c Mon Sep 17 00:00:00 2001
From: Peter Gonda <pgonda@google.com>
Date: Wed, 8 Dec 2021 11:16:40 -0800
Subject: selftests: KVM: sev_migrate_tests: Fix test_sev_mirror()

Mirrors should not be able to call LAUNCH_START. Remove the call on the
mirror to correct the test before fixing sev_ioctl() to correctly assert
on this failed ioctl.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Marc Orr <marcorr@google.com>
Signed-off-by: Peter Gonda <pgonda@google.com>
Message-Id: <20211208191642.3792819-2-pgonda@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 29b18d565cf4..fbc742b42145 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -228,9 +228,6 @@ static void sev_mirror_create(int dst_fd, int src_fd)
 static void test_sev_mirror(bool es)
 {
 	struct kvm_vm *src_vm, *dst_vm;
-	struct kvm_sev_launch_start start = {
-		.policy = es ? SEV_POLICY_ES : 0
-	};
 	int i;
 
 	src_vm = sev_vm_create(es);
@@ -241,7 +238,7 @@ static void test_sev_mirror(bool es)
 	/* Check that we can complete creation of the mirror VM.  */
 	for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
 		vm_vcpu_add(dst_vm, i);
-	sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_START, &start);
+
 	if (es)
 		sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 
-- 
cgit v1.2.3


From 427d046a41bbad295552fa0a7496e15d98eae389 Mon Sep 17 00:00:00 2001
From: Peter Gonda <pgonda@google.com>
Date: Wed, 8 Dec 2021 11:16:41 -0800
Subject: selftests: KVM: sev_migrate_tests: Fix sev_ioctl()

TEST_ASSERT in SEV ioctl was allowing errors because it checked return
value was good OR the FW error code was OK. This TEST_ASSERT should
require both (aka. AND) values are OK. Removes the LAUNCH_START from the
mirror VM because this call correctly fails because mirror VMs cannot
call this command. Currently issues with the PSP driver functions mean
the firmware error is not always reset to SEV_RET_SUCCESS when a call is
successful. Mainly sev_platform_init() doesn't correctly set the fw
error if the platform has already been initialized.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Marc Orr <marcorr@google.com>
Signed-off-by: Peter Gonda <pgonda@google.com>
Message-Id: <20211208191642.3792819-3-pgonda@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index fbc742b42145..4bb960ca6486 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -30,8 +30,9 @@ static void sev_ioctl(int vm_fd, int cmd_id, void *data)
 	};
 	int ret;
 
+
 	ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
-	TEST_ASSERT((ret == 0 || cmd.error == SEV_RET_SUCCESS),
+	TEST_ASSERT(ret == 0 && cmd.error == SEV_RET_SUCCESS,
 		    "%d failed: return code: %d, errno: %d, fw error: %d",
 		    cmd_id, ret, errno, cmd.error);
 }
-- 
cgit v1.2.3


From a6fec53947cf48e14cc41e2a69dd7d2aa7a00e65 Mon Sep 17 00:00:00 2001
From: Peter Gonda <pgonda@google.com>
Date: Wed, 8 Dec 2021 11:16:42 -0800
Subject: selftests: KVM: sev_migrate_tests: Add mirror command tests

Add tests to confirm mirror vms can only run correct subset of commands.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Marc Orr <marcorr@google.com>
Signed-off-by: Peter Gonda <pgonda@google.com>
Message-Id: <20211208191642.3792819-4-pgonda@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/x86_64/sev_migrate_tests.c       | 55 ++++++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index 4bb960ca6486..80056bbbb003 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -21,7 +21,7 @@
 #define NR_LOCK_TESTING_THREADS 3
 #define NR_LOCK_TESTING_ITERATIONS 10000
 
-static void sev_ioctl(int vm_fd, int cmd_id, void *data)
+static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error)
 {
 	struct kvm_sev_cmd cmd = {
 		.id = cmd_id,
@@ -30,11 +30,20 @@ static void sev_ioctl(int vm_fd, int cmd_id, void *data)
 	};
 	int ret;
 
-
 	ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
-	TEST_ASSERT(ret == 0 && cmd.error == SEV_RET_SUCCESS,
+	*fw_error = cmd.error;
+	return ret;
+}
+
+static void sev_ioctl(int vm_fd, int cmd_id, void *data)
+{
+	int ret;
+	__u32 fw_error;
+
+	ret = __sev_ioctl(vm_fd, cmd_id, data, &fw_error);
+	TEST_ASSERT(ret == 0 && fw_error == SEV_RET_SUCCESS,
 		    "%d failed: return code: %d, errno: %d, fw error: %d",
-		    cmd_id, ret, errno, cmd.error);
+		    cmd_id, ret, errno, fw_error);
 }
 
 static struct kvm_vm *sev_vm_create(bool es)
@@ -226,6 +235,42 @@ static void sev_mirror_create(int dst_fd, int src_fd)
 	TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno);
 }
 
+static void verify_mirror_allowed_cmds(int vm_fd)
+{
+	struct kvm_sev_guest_status status;
+
+	for (int cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
+		int ret;
+		__u32 fw_error;
+
+		/*
+		 * These commands are allowed for mirror VMs, all others are
+		 * not.
+		 */
+		switch (cmd_id) {
+		case KVM_SEV_LAUNCH_UPDATE_VMSA:
+		case KVM_SEV_GUEST_STATUS:
+		case KVM_SEV_DBG_DECRYPT:
+		case KVM_SEV_DBG_ENCRYPT:
+			continue;
+		default:
+			break;
+		}
+
+		/*
+		 * These commands should be disallowed before the data
+		 * parameter is examined so NULL is OK here.
+		 */
+		ret = __sev_ioctl(vm_fd, cmd_id, NULL, &fw_error);
+		TEST_ASSERT(
+			ret == -1 && errno == EINVAL,
+			"Should not be able call command: %d. ret: %d, errno: %d\n",
+			cmd_id, ret, errno);
+	}
+
+	sev_ioctl(vm_fd, KVM_SEV_GUEST_STATUS, &status);
+}
+
 static void test_sev_mirror(bool es)
 {
 	struct kvm_vm *src_vm, *dst_vm;
@@ -243,6 +288,8 @@ static void test_sev_mirror(bool es)
 	if (es)
 		sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 
+	verify_mirror_allowed_cmds(dst_vm->fd);
+
 	kvm_vm_free(src_vm);
 	kvm_vm_free(dst_vm);
 }
-- 
cgit v1.2.3


From 14243b387137a4afbe1df5d9dc15182d6657bb79 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 10 Dec 2021 16:36:23 +0000
Subject: KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel
 delivery

This adds basic support for delivering 2 level event channels to a guest.

Initially, it only supports delivery via the IRQ routing table, triggered
by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast()
function which will use the pre-mapped shared_info page if it already
exists and is still valid, while the slow path through the irqfd_inject
workqueue will remap the shared_info page if necessary.

It sets the bits in the shared_info page but not the vcpu_info; that is
deferred to __kvm_xen_has_interrupt() which raises the vector to the
appropriate vCPU.

Add a 'verbose' mode to xen_shinfo_test while adding test cases for this.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <20211210163625.2886-5-dwmw2@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst                     |  21 ++
 arch/x86/include/asm/kvm_host.h                    |   1 +
 arch/x86/kvm/irq_comm.c                            |  12 +
 arch/x86/kvm/x86.c                                 |   3 +-
 arch/x86/kvm/xen.c                                 | 262 ++++++++++++++++++++-
 arch/x86/kvm/xen.h                                 |   9 +
 include/linux/kvm_host.h                           |   7 +
 include/uapi/linux/kvm.h                           |  11 +
 .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 184 ++++++++++++++-
 9 files changed, 503 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c168be764707..6b683dfea8f2 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1799,6 +1799,7 @@ No flags are specified so far, the corresponding field must be set to zero.
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
   };
@@ -1808,6 +1809,7 @@ No flags are specified so far, the corresponding field must be set to zero.
   #define KVM_IRQ_ROUTING_MSI 2
   #define KVM_IRQ_ROUTING_S390_ADAPTER 3
   #define KVM_IRQ_ROUTING_HV_SINT 4
+  #define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 flags:
 
@@ -1859,6 +1861,20 @@ address_hi must be zero.
 	__u32 sint;
   };
 
+  struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+  };
+
+
+When KVM_CAP_XEN_HVM includes the KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL bit
+in its indication of supported features, routing to Xen event channels
+is supported. Although the priority field is present, only the value
+KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL is supported, which means delivery by
+2 level event channels. FIFO event channel support may be added in
+the future.
+
 
 4.55 KVM_SET_TSC_KHZ
 --------------------
@@ -7413,6 +7429,7 @@ PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
   #define KVM_XEN_HVM_CONFIG_SHARED_INFO	(1 << 2)
   #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 2)
+  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 3)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -7432,6 +7449,10 @@ The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
 
+The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
+of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
+field set to indicate 2 level event channel delivery.
+
 8.31 KVM_CAP_PPC_MULTITCE
 -------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e61e11e750f..623fb7c4992c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -610,6 +610,7 @@ struct kvm_vcpu_xen {
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
+	unsigned long evtchn_pending_sel;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 39ad02d6dc63..6e0dab04320e 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
 
 #include "hyperv.h"
 #include "x86.h"
+#include "xen.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 			return r;
 		break;
 
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		if (!level)
+			return -1;
+
+		return kvm_xen_set_evtchn_fast(e, kvm);
+#endif
 	default:
 		break;
 	}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bde45a1bc2..3050601d5d73 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4188,7 +4188,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_XEN_HVM:
 		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
-		    KVM_XEN_HVM_CONFIG_SHARED_INFO;
+		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
+		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
 		if (sched_info_on())
 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
 		break;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index da4bf2c6407f..ceddabd1f5c6 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -16,6 +16,7 @@
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/event_channel.h>
 
 #include "trace.h"
 
@@ -195,6 +196,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	bool atomic = in_atomic() || !task_is_running(current);
 	int err;
 	u8 rc = 0;
 
@@ -204,6 +207,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 */
 	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
 	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool ghc_valid = slots->generation == ghc->generation &&
+		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
 	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
@@ -219,8 +225,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * cache in kvm_read_guest_offset_cached(), but just uses
 	 * __get_user() instead. And falls back to the slow path.
 	 */
-	if (likely(slots->generation == ghc->generation &&
-		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+	if (!evtchn_pending_sel && ghc_valid) {
 		/* Fast path */
 		pagefault_disable();
 		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -239,11 +244,82 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * and we'll end up getting called again from a context where we *can*
 	 * fault in the page and wait for it.
 	 */
-	if (in_atomic() || !task_is_running(current))
+	if (atomic)
 		return 1;
 
-	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-				     sizeof(rc));
+	if (!ghc_valid) {
+		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+		if (err || !ghc->memslot) {
+			/*
+			 * If this failed, userspace has screwed up the
+			 * vcpu_info mapping. No interrupts for you.
+			 */
+			return 0;
+		}
+	}
+
+	/*
+	 * Now we have a valid (protected by srcu) userspace HVA in
+	 * ghc->hva which points to the struct vcpu_info. If there
+	 * are any bits in the in-kernel evtchn_pending_sel then
+	 * we need to write those to the guest vcpu_info and set
+	 * its evtchn_upcall_pending flag. If there aren't any bits
+	 * to add, we only want to *check* evtchn_upcall_pending.
+	 */
+	if (evtchn_pending_sel) {
+		bool long_mode = v->kvm->arch.xen.long_mode;
+
+		if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
+			return 0;
+
+		if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
+			struct vcpu_info __user *vi = (void __user *)ghc->hva;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+				     "\tnotq %0\n"
+				     "\t" LOCK_PREFIX "andq %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel),
+				       "+m" (vi->evtchn_pending_sel),
+				       "+m" (v->arch.xen.evtchn_pending_sel)
+				     : "0" (evtchn_pending_sel));
+		} else {
+			struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
+			u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+				     "\tnotl %0\n"
+				     "\t" LOCK_PREFIX "andl %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel32),
+				       "+m" (vi->evtchn_pending_sel),
+				       "+m" (v->arch.xen.evtchn_pending_sel)
+				     : "0" (evtchn_pending_sel32));
+		}
+		rc = 1;
+		unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
+
+	err:
+		user_access_end();
+
+		mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	} else {
+		__get_user(rc, (u8 __user *)ghc->hva + offset);
+	}
 
 	return rc;
 }
@@ -740,3 +816,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return EVTCHN_2L_NR_CHANNELS;
+	else
+		return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+/*
+ * This follows the kvm_set_irq() API, so it returns:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+			    struct kvm *kvm)
+{
+	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+	struct kvm_vcpu *vcpu;
+	unsigned long *pending_bits, *mask_bits;
+	unsigned long flags;
+	int port_word_bit;
+	bool kick_vcpu = false;
+	int idx;
+	int rc;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+	if (!vcpu)
+		return -1;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -1;
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -1;
+
+	rc = -EWOULDBLOCK;
+	read_lock_irqsave(&gpc->lock, flags);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+		goto out_rcu;
+
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+		struct shared_info *shinfo = gpc->khva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 64;
+	} else {
+		struct compat_shared_info *shinfo = gpc->khva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 32;
+	}
+
+	/*
+	 * If this port wasn't already set, and if it isn't masked, then
+	 * we try to set the corresponding bit in the in-kernel shadow of
+	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+	 * already set, then we kick the vCPU in question to write to the
+	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+	 */
+	if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+		rc = 0; /* It was already raised */
+	} else if (test_bit(e->xen_evtchn.port, mask_bits)) {
+		rc = -1; /* Masked */
+	} else {
+		rc = 1; /* Delivered. But was the vCPU waking already? */
+		if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+			kick_vcpu = true;
+	}
+
+ out_rcu:
+	srcu_read_unlock(&kvm->srcu, idx);
+	read_unlock_irqrestore(&gpc->lock, flags);
+
+	if (kick_vcpu) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+			 int irq_source_id, int level, bool line_status)
+{
+	bool mm_borrowed = false;
+	int rc;
+
+	if (!level)
+		return -1;
+
+	rc = kvm_xen_set_evtchn_fast(e, kvm);
+	if (rc != -EWOULDBLOCK)
+		return rc;
+
+	if (current->mm != kvm->mm) {
+		/*
+		 * If not on a thread which already belongs to this KVM,
+		 * we'd better be in the irqfd workqueue.
+		 */
+		if (WARN_ON_ONCE(current->mm))
+			return -EINVAL;
+
+		kthread_use_mm(kvm->mm);
+		mm_borrowed = true;
+	}
+
+	/*
+	 * For the irqfd workqueue, using the main kvm->lock mutex is
+	 * fine since this function is invoked from kvm_set_irq() with
+	 * no other lock held, no srcu. In future if it will be called
+	 * directly from a vCPU thread (e.g. on hypercall for an IPI)
+	 * then it may need to switch to using a leaf-node mutex for
+	 * serializing the shared_info mapping.
+	 */
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * It is theoretically possible for the page to be unmapped
+	 * and the MMU notifier to invalidate the shared_info before
+	 * we even get to use it. In that case, this looks like an
+	 * infinite loop. It was tempting to do it via the userspace
+	 * HVA instead... but that just *hides* the fact that it's
+	 * an infinite loop, because if a fault occurs and it waits
+	 * for the page to come back, it can *still* immediately
+	 * fault and have to wait again, repeatedly.
+	 *
+	 * Conversely, the page could also have been reinstated by
+	 * another thread before we even obtain the mutex above, so
+	 * check again *first* before remapping it.
+	 */
+	do {
+		struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+		int idx;
+
+		rc = kvm_xen_set_evtchn_fast(e, kvm);
+		if (rc != -EWOULDBLOCK)
+			break;
+
+		idx = srcu_read_lock(&kvm->srcu);
+		rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
+						  PAGE_SIZE, false);
+		srcu_read_unlock(&kvm->srcu, idx);
+	} while(!rc);
+
+	mutex_unlock(&kvm->lock);
+
+	if (mm_borrowed)
+		kthread_unuse_mm(kvm->mm);
+
+	return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue)
+
+{
+	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* We only support 2 level event channels for now */
+	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+		return -EINVAL;
+
+	e->xen_evtchn.port = ue->u.xen_evtchn.port;
+	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+	e->set = evtchn_set_fn;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..adbcc9ed59db 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+			    struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue);
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -134,6 +140,9 @@ struct compat_shared_info {
 	struct compat_arch_shared_info arch;
 };
 
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 *				\
+				      sizeof_field(struct compat_shared_info, \
+						   evtchn_pending))
 struct compat_vcpu_runstate_info {
     int state;
     uint64_t state_entry_time;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9bbb1f1d9e48..3c47b146851a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -497,6 +497,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -517,6 +523,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 469f05d69c8d..fbfd70d965c6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1163,11 +1163,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1179,6 +1188,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
@@ -1209,6 +1219,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index a0699f00b3d6..478e0ae8b93e 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -14,6 +14,9 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <signal.h>
+
+#include <sys/eventfd.h>
 
 #define VCPU_ID		5
 
@@ -22,10 +25,15 @@
 #define SHINFO_REGION_SLOT	10
 #define PAGE_SIZE		4096
 
+#define DUMMY_REGION_GPA	(SHINFO_REGION_GPA + (2 * PAGE_SIZE))
+#define DUMMY_REGION_SLOT	11
+
+#define SHINFO_ADDR	(SHINFO_REGION_GPA)
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
 #define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
 
+#define SHINFO_VADDR	(SHINFO_REGION_GVA)
 #define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
 
@@ -73,15 +81,37 @@ struct vcpu_info {
         struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
+struct shared_info {
+	struct vcpu_info vcpu_info[32];
+	unsigned long evtchn_pending[64];
+	unsigned long evtchn_mask[64];
+	struct pvclock_wall_clock wc;
+	uint32_t wc_sec_hi;
+	/* arch_shared_info here */
+};
+
 #define RUNSTATE_running  0
 #define RUNSTATE_runnable 1
 #define RUNSTATE_blocked  2
 #define RUNSTATE_offline  3
 
+static const char *runstate_names[] = {
+	"running",
+	"runnable",
+	"blocked",
+	"offline"
+};
+
+struct {
+	struct kvm_irq_routing info;
+	struct kvm_irq_routing_entry entries[2];
+} irq_routes;
+
 static void evtchn_handler(struct ex_regs *regs)
 {
 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
 	vi->evtchn_upcall_pending = 0;
+	vi->evtchn_pending_sel = 0;
 
 	GUEST_SYNC(0x20);
 }
@@ -127,7 +157,25 @@ static void guest_code(void)
 	GUEST_SYNC(6);
 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
 
-	GUEST_DONE();
+	/* Attempt to deliver a *masked* interrupt */
+	GUEST_SYNC(7);
+
+	/* Wait until we see the bit set */
+	struct shared_info *si = (void *)SHINFO_VADDR;
+	while (!si->evtchn_pending[0])
+		__asm__ __volatile__ ("rep nop" : : : "memory");
+
+	/* Now deliver an *unmasked* interrupt */
+	GUEST_SYNC(8);
+
+	while (!si->evtchn_pending[1])
+		__asm__ __volatile__ ("rep nop" : : : "memory");
+
+	/* Change memslots and deliver an interrupt */
+	GUEST_SYNC(9);
+
+	for (;;)
+		__asm__ __volatile__ ("rep nop" : : : "memory");
 }
 
 static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -144,9 +192,18 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
 		return 0;
 }
 
+static void handle_alrm(int sig)
+{
+	TEST_FAIL("IRQ delivery timed out");
+}
+
 int main(int argc, char *argv[])
 {
 	struct timespec min_ts, max_ts, vm_ts;
+	bool verbose;
+
+	verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
+			       !strncmp(argv[1], "--verbose", 10));
 
 	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
 	if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) {
@@ -155,6 +212,7 @@ int main(int argc, char *argv[])
 	}
 
 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
@@ -166,6 +224,11 @@ int main(int argc, char *argv[])
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2);
 
+	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+
+	int zero_fd = open("/dev/zero", O_RDONLY);
+	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
+
 	struct kvm_xen_hvm_config hvmc = {
 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
 		.msr = XEN_HYPERCALL_MSR,
@@ -184,6 +247,16 @@ int main(int argc, char *argv[])
 	};
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
 
+	/*
+	 * Test what happens when the HVA of the shinfo page is remapped after
+	 * the kernel has a reference to it. But make sure we copy the clock
+	 * info over since that's only set at setup time, and we test it later.
+	 */
+	struct pvclock_wall_clock wc_copy = shinfo->wc;
+	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
+	shinfo->wc = wc_copy;
+
 	struct kvm_xen_vcpu_attr vi = {
 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
 		.u.gpa = VCPU_INFO_ADDR,
@@ -214,6 +287,49 @@ int main(int argc, char *argv[])
 		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
 	}
 
+	int irq_fd[2] = { -1, -1 };
+
+	if (do_eventfd_tests) {
+		irq_fd[0] = eventfd(0, 0);
+		irq_fd[1] = eventfd(0, 0);
+
+		/* Unexpected, but not a KVM failure */
+		if (irq_fd[0] == -1 || irq_fd[1] == -1)
+			do_eventfd_tests = false;
+	}
+
+	if (do_eventfd_tests) {
+		irq_routes.info.nr = 2;
+
+		irq_routes.entries[0].gsi = 32;
+		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[0].u.xen_evtchn.port = 15;
+		irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		irq_routes.entries[1].gsi = 33;
+		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[1].u.xen_evtchn.port = 66;
+		irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes);
+
+		struct kvm_irqfd ifd = { };
+
+		ifd.fd = irq_fd[0];
+		ifd.gsi = 32;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		ifd.fd = irq_fd[1];
+		ifd.gsi = 33;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		struct sigaction sa = { };
+		sa.sa_handler = handle_alrm;
+		sigaction(SIGALRM, &sa, NULL);
+	}
+
 	struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
 	vinfo->evtchn_upcall_pending = 0;
 
@@ -248,6 +364,8 @@ int main(int argc, char *argv[])
 
 			switch (uc.args[1]) {
 			case 0:
+				if (verbose)
+					printf("Delivering evtchn upcall\n");
 				evtchn_irq_expected = true;
 				vinfo->evtchn_upcall_pending = 1;
 				break;
@@ -256,11 +374,16 @@ int main(int argc, char *argv[])
 				TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
 				if (!do_runstate_tests)
 					goto done;
+				if (verbose)
+					printf("Testing runstate %s\n", runstate_names[uc.args[1]]);
 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
 				rst.u.runstate.state = uc.args[1];
 				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
 				break;
+
 			case 4:
+				if (verbose)
+					printf("Testing RUNSTATE_ADJUST\n");
 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
 				memset(&rst.u, 0, sizeof(rst.u));
 				rst.u.runstate.state = (uint64_t)-1;
@@ -274,6 +397,8 @@ int main(int argc, char *argv[])
 				break;
 
 			case 5:
+				if (verbose)
+					printf("Testing RUNSTATE_DATA\n");
 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
 				memset(&rst.u, 0, sizeof(rst.u));
 				rst.u.runstate.state = RUNSTATE_running;
@@ -282,16 +407,54 @@ int main(int argc, char *argv[])
 				rst.u.runstate.time_offline = 0x5a;
 				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
 				break;
+
 			case 6:
+				if (verbose)
+					printf("Testing steal time\n");
 				/* Yield until scheduler delay exceeds target */
 				rundelay = get_run_delay() + MIN_STEAL_TIME;
 				do {
 					sched_yield();
 				} while (get_run_delay() < rundelay);
 				break;
+
+			case 7:
+				if (!do_eventfd_tests)
+					goto done;
+				if (verbose)
+					printf("Testing masked event channel\n");
+				shinfo->evtchn_mask[0] = 0x8000;
+				eventfd_write(irq_fd[0], 1UL);
+				alarm(1);
+				break;
+
+			case 8:
+				if (verbose)
+					printf("Testing unmasked event channel\n");
+				/* Unmask that, but deliver the other one */
+				shinfo->evtchn_pending[0] = 0;
+				shinfo->evtchn_mask[0] = 0;
+				eventfd_write(irq_fd[1], 1UL);
+				evtchn_irq_expected = true;
+				alarm(1);
+				break;
+
+			case 9:
+				if (verbose)
+					printf("Testing event channel after memslot change\n");
+				vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+							    DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0);
+				eventfd_write(irq_fd[0], 1UL);
+				evtchn_irq_expected = true;
+				alarm(1);
+				break;
+
 			case 0x20:
 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
 				evtchn_irq_expected = false;
+				if (shinfo->evtchn_pending[1] &&
+				    shinfo->evtchn_pending[0])
+					goto done;
 				break;
 			}
 			break;
@@ -318,6 +481,16 @@ int main(int argc, char *argv[])
 	ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
 	ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
 
+	if (verbose) {
+		printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
+		printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
+		       ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul,
+		       ti->tsc_shift, ti->flags);
+		printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
+		       ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
+		       ti2->tsc_shift, ti2->flags);
+	}
+
 	vm_ts.tv_sec = wc->sec;
 	vm_ts.tv_nsec = wc->nsec;
         TEST_ASSERT(wc->version && !(wc->version & 1),
@@ -341,6 +514,15 @@ int main(int argc, char *argv[])
 		};
 		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst);
 
+		if (verbose) {
+			printf("Runstate: %s(%d), entry %" PRIu64 " ns\n",
+			       rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown",
+			       rs->state, rs->state_entry_time);
+			for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) {
+				printf("State %s: %" PRIu64 " ns\n",
+				       runstate_names[i], rs->time[i]);
+			}
+		}
 		TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
 		TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
 			    "State entry time mismatch");
-- 
cgit v1.2.3


From 7d9a662ed9f0403e7b94940dceb81552b8edb931 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Fri, 10 Dec 2021 10:46:11 -0600
Subject: kvm: selftests: move base kvm_util.h declarations to kvm_util_base.h

Between helper macros and interfaces that will be introduced in
subsequent patches, much of kvm_util.h would end up being declarations
specific to ucall. Ideally these could be separated out into a separate
header since they are not strictly required for writing guest tests and
are mostly self-contained interfaces other than a reliance on a few
core declarations like struct kvm_vm. This doesn't make a big
difference as far as how tests will be compiled/written since all these
interfaces will still be packaged up into a single/common libkvm.a used
by all tests, but it is still nice to be able to compartmentalize to
improve readabilty and reduce merge conflicts in the future for common
tasks like adding new interfaces to kvm_util.h.

Furthermore, some of the ucall declarations will be arch-specific,
requiring various #ifdef'ery in kvm_util.h. Ideally these declarations
could live in separate arch-specific headers, e.g.
include/<arch>/ucall.h, which would handle arch-specific declarations
as well as pulling in common ucall-related declarations shared by all
archs.

One simple way to do this would be to #include ucall.h at the bottom of
kvm_util.h, after declarations it relies upon like struct kvm_vm.
This is brittle however, and doesn't scale easily to other sets of
interfaces that may be added in the future.

Instead, move all declarations currently in kvm_util.h into
kvm_util_base.h, then have kvm_util.h #include it. With this change,
non-base declarations can be selectively moved/introduced into separate
headers, which can then be included in kvm_util.h so that individual
tests don't need to be touched. Subsequent patches will then move
ucall-related declarations into a separate header to meet the above
goals.

Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-Id: <20211210164620.11636-2-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     | 436 +-------------------
 .../testing/selftests/kvm/include/kvm_util_base.h  | 446 +++++++++++++++++++++
 2 files changed, 447 insertions(+), 435 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/kvm_util_base.h

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 4ba3f47a41d2..c860ced3888d 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -7,440 +7,6 @@
 #ifndef SELFTEST_KVM_UTIL_H
 #define SELFTEST_KVM_UTIL_H
 
-#include "test_util.h"
-
-#include "asm/kvm.h"
-#include "linux/list.h"
-#include "linux/kvm.h"
-#include <sys/ioctl.h>
-
-#include "sparsebit.h"
-
-#define KVM_DEV_PATH "/dev/kvm"
-#define KVM_MAX_VCPUS 512
-
-#define NSEC_PER_SEC 1000000000L
-
-/*
- * Callers of kvm_util only have an incomplete/opaque description of the
- * structure kvm_util is using to maintain the state of a VM.
- */
-struct kvm_vm;
-
-typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
-typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
-
-/* Minimum allocated guest virtual and physical addresses */
-#define KVM_UTIL_MIN_VADDR		0x2000
-#define KVM_GUEST_PAGE_TABLE_MIN_PADDR	0x180000
-
-#define DEFAULT_GUEST_PHY_PAGES		512
-#define DEFAULT_GUEST_STACK_VADDR_MIN	0xab6000
-#define DEFAULT_STACK_PGS		5
-
-enum vm_guest_mode {
-	VM_MODE_P52V48_4K,
-	VM_MODE_P52V48_64K,
-	VM_MODE_P48V48_4K,
-	VM_MODE_P48V48_16K,
-	VM_MODE_P48V48_64K,
-	VM_MODE_P40V48_4K,
-	VM_MODE_P40V48_16K,
-	VM_MODE_P40V48_64K,
-	VM_MODE_PXXV48_4K,	/* For 48bits VA but ANY bits PA */
-	VM_MODE_P47V64_4K,
-	VM_MODE_P44V64_4K,
-	VM_MODE_P36V48_4K,
-	VM_MODE_P36V48_16K,
-	VM_MODE_P36V48_64K,
-	VM_MODE_P36V47_16K,
-	NUM_VM_MODES,
-};
-
-#if defined(__aarch64__)
-
-extern enum vm_guest_mode vm_mode_default;
-
-#define VM_MODE_DEFAULT			vm_mode_default
-#define MIN_PAGE_SHIFT			12U
-#define ptes_per_page(page_size)	((page_size) / 8)
-
-#elif defined(__x86_64__)
-
-#define VM_MODE_DEFAULT			VM_MODE_PXXV48_4K
-#define MIN_PAGE_SHIFT			12U
-#define ptes_per_page(page_size)	((page_size) / 8)
-
-#elif defined(__s390x__)
-
-#define VM_MODE_DEFAULT			VM_MODE_P44V64_4K
-#define MIN_PAGE_SHIFT			12U
-#define ptes_per_page(page_size)	((page_size) / 16)
-
-#elif defined(__riscv)
-
-#if __riscv_xlen == 32
-#error "RISC-V 32-bit kvm selftests not supported"
-#endif
-
-#define VM_MODE_DEFAULT			VM_MODE_P40V48_4K
-#define MIN_PAGE_SHIFT			12U
-#define ptes_per_page(page_size)	((page_size) / 8)
-
-#endif
-
-#define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
-#define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)
-
-struct vm_guest_mode_params {
-	unsigned int pa_bits;
-	unsigned int va_bits;
-	unsigned int page_size;
-	unsigned int page_shift;
-};
-extern const struct vm_guest_mode_params vm_guest_mode_params[];
-
-int open_path_or_exit(const char *path, int flags);
-int open_kvm_dev_path_or_exit(void);
-int kvm_check_cap(long cap);
-int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
-int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
-		    struct kvm_enable_cap *cap);
-void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
-const char *vm_guest_mode_string(uint32_t i);
-
-struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
-void kvm_vm_free(struct kvm_vm *vmp);
-void kvm_vm_restart(struct kvm_vm *vmp, int perm);
-void kvm_vm_release(struct kvm_vm *vmp);
-void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
-void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
-			    uint64_t first_page, uint32_t num_pages);
-uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm);
-
-int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
-		       size_t len);
-
-void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename);
-
-void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
-
-/*
- * VM VCPU Dump
- *
- * Input Args:
- *   stream - Output FILE stream
- *   vm     - Virtual Machine
- *   vcpuid - VCPU ID
- *   indent - Left margin indent amount
- *
- * Output Args: None
- *
- * Return: None
- *
- * Dumps the current state of the VCPU specified by @vcpuid, within the VM
- * given by @vm, to the FILE stream given by @stream.
- */
-void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
-	       uint8_t indent);
-
-void vm_create_irqchip(struct kvm_vm *vm);
-
-void vm_userspace_mem_region_add(struct kvm_vm *vm,
-	enum vm_mem_backing_src_type src_type,
-	uint64_t guest_paddr, uint32_t slot, uint64_t npages,
-	uint32_t flags);
-
-void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
-		void *arg);
-int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
-		void *arg);
-void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
-int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg);
-void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
-int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
-void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
-void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
-void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
-vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
-vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
-vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
-
-void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-	      unsigned int npages);
-void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
-void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
-vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
-void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
-
-/*
- * Address Guest Virtual to Guest Physical
- *
- * Input Args:
- *   vm - Virtual Machine
- *   gva - VM virtual address
- *
- * Output Args: None
- *
- * Return:
- *   Equivalent VM physical address
- *
- * Returns the VM physical address of the translated VM virtual
- * address given by @gva.
- */
-vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
-
-struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
-int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
-int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
-			  struct kvm_guest_debug *debug);
-void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
-		       struct kvm_mp_state *mp_state);
-struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid);
-void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
-void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
-
-/*
- * VM VCPU Args Set
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - VCPU ID
- *   num - number of arguments
- *   ... - arguments, each of type uint64_t
- *
- * Output Args: None
- *
- * Return: None
- *
- * Sets the first @num function input registers of the VCPU with @vcpuid,
- * per the C calling convention of the architecture, to the values given
- * as variable args. Each of the variable args is expected to be of type
- * uint64_t. The maximum @num can be is specific to the architecture.
- */
-void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
-
-void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
-		    struct kvm_sregs *sregs);
-void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
-		    struct kvm_sregs *sregs);
-int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
-		    struct kvm_sregs *sregs);
-void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid,
-		  struct kvm_fpu *fpu);
-void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid,
-		  struct kvm_fpu *fpu);
-void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
-void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
-#ifdef __KVM_HAVE_VCPU_EVENTS
-void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
-		     struct kvm_vcpu_events *events);
-void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
-		     struct kvm_vcpu_events *events);
-#endif
-#ifdef __x86_64__
-void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
-			   struct kvm_nested_state *state);
-int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
-			  struct kvm_nested_state *state, bool ignore_error);
-#endif
-void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
-
-int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
-int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
-int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
-int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
-int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
-		       void *val, bool write);
-int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
-		      void *val, bool write);
-void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
-int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
-
-int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
-			  uint64_t attr);
-int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
-			 uint64_t attr);
-int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
-			  uint64_t attr, void *val, bool write);
-int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
-			 uint64_t attr, void *val, bool write);
-
-#define KVM_MAX_IRQ_ROUTES		4096
-
-struct kvm_irq_routing *kvm_gsi_routing_create(void);
-void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
-		uint32_t gsi, uint32_t pin);
-int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
-void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
-
-const char *exit_reason_str(unsigned int exit_reason);
-
-void virt_pgd_alloc(struct kvm_vm *vm);
-
-/*
- * VM Virtual Page Map
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vaddr - VM Virtual Address
- *   paddr - VM Physical Address
- *   memslot - Memory region slot for new virtual translation tables
- *
- * Output Args: None
- *
- * Return: None
- *
- * Within @vm, creates a virtual translation for the page starting
- * at @vaddr to the page starting at @paddr.
- */
-void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr);
-
-vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
-			     uint32_t memslot);
-vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
-			      vm_paddr_t paddr_min, uint32_t memslot);
-vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
-
-/*
- * Create a VM with reasonable defaults
- *
- * Input Args:
- *   vcpuid - The id of the single VCPU to add to the VM.
- *   extra_mem_pages - The number of extra pages to add (this will
- *                     decide how much extra space we will need to
- *                     setup the page tables using memslot 0)
- *   guest_code - The vCPU's entry point
- *
- * Output Args: None
- *
- * Return:
- *   Pointer to opaque structure that describes the created VM.
- */
-struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
-				 void *guest_code);
-
-/* Same as vm_create_default, but can be used for more than one vcpu */
-struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_mem_pages,
-					    uint32_t num_percpu_pages, void *guest_code,
-					    uint32_t vcpuids[]);
-
-/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */
-struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
-				    uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
-				    uint32_t num_percpu_pages, void *guest_code,
-				    uint32_t vcpuids[]);
-
-/*
- * Adds a vCPU with reasonable defaults (e.g. a stack)
- *
- * Input Args:
- *   vm - Virtual Machine
- *   vcpuid - The id of the VCPU to add to the VM.
- *   guest_code - The vCPU's entry point
- */
-void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
-
-bool vm_is_unrestricted_guest(struct kvm_vm *vm);
-
-unsigned int vm_get_page_size(struct kvm_vm *vm);
-unsigned int vm_get_page_shift(struct kvm_vm *vm);
-unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
-uint64_t vm_get_max_gfn(struct kvm_vm *vm);
-int vm_get_fd(struct kvm_vm *vm);
-
-unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
-unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
-unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
-static inline unsigned int
-vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
-{
-	unsigned int n;
-	n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages));
-#ifdef __s390x__
-	/* s390 requires 1M aligned guest sizes */
-	n = (n + 255) & ~255;
-#endif
-	return n;
-}
-
-struct kvm_userspace_memory_region *
-kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
-				 uint64_t end);
-
-struct kvm_dirty_log *
-allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
-
-int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
-
-#define sync_global_to_guest(vm, g) ({				\
-	typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));	\
-	memcpy(_p, &(g), sizeof(g));				\
-})
-
-#define sync_global_from_guest(vm, g) ({			\
-	typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));	\
-	memcpy(&(g), _p, sizeof(g));				\
-})
-
-void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
-
-/* Common ucalls */
-enum {
-	UCALL_NONE,
-	UCALL_SYNC,
-	UCALL_ABORT,
-	UCALL_DONE,
-	UCALL_UNHANDLED,
-};
-
-#define UCALL_MAX_ARGS 6
-
-struct ucall {
-	uint64_t cmd;
-	uint64_t args[UCALL_MAX_ARGS];
-};
-
-void ucall_init(struct kvm_vm *vm, void *arg);
-void ucall_uninit(struct kvm_vm *vm);
-void ucall(uint64_t cmd, int nargs, ...);
-uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
-
-#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4)	\
-				ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
-#define GUEST_SYNC(stage)	ucall(UCALL_SYNC, 2, "hello", stage)
-#define GUEST_DONE()		ucall(UCALL_DONE, 0)
-#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do {    \
-	if (!(_condition))                                              \
-		ucall(UCALL_ABORT, 2 + _nargs,                          \
-			"Failed guest assert: "                         \
-			_condstr, __LINE__, _args);                     \
-} while (0)
-
-#define GUEST_ASSERT(_condition) \
-	__GUEST_ASSERT(_condition, #_condition, 0, 0)
-
-#define GUEST_ASSERT_1(_condition, arg1) \
-	__GUEST_ASSERT(_condition, #_condition, 1, (arg1))
-
-#define GUEST_ASSERT_2(_condition, arg1, arg2) \
-	__GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2))
-
-#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
-	__GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3))
-
-#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
-	__GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4))
-
-#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b)
-
-int vm_get_stats_fd(struct kvm_vm *vm);
-int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid);
-
-uint32_t guest_get_vcpuid(void);
+#include "kvm_util_base.h"
 
 #endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
new file mode 100644
index 000000000000..749da2c4bab0
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/kvm_util_base.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+#ifndef SELFTEST_KVM_UTIL_BASE_H
+#define SELFTEST_KVM_UTIL_BASE_H
+
+#include "test_util.h"
+
+#include "asm/kvm.h"
+#include "linux/list.h"
+#include "linux/kvm.h"
+#include <sys/ioctl.h>
+
+#include "sparsebit.h"
+
+#define KVM_DEV_PATH "/dev/kvm"
+#define KVM_MAX_VCPUS 512
+
+#define NSEC_PER_SEC 1000000000L
+
+/*
+ * Callers of kvm_util only have an incomplete/opaque description of the
+ * structure kvm_util is using to maintain the state of a VM.
+ */
+struct kvm_vm;
+
+typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
+typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
+
+/* Minimum allocated guest virtual and physical addresses */
+#define KVM_UTIL_MIN_VADDR		0x2000
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR	0x180000
+
+#define DEFAULT_GUEST_PHY_PAGES		512
+#define DEFAULT_GUEST_STACK_VADDR_MIN	0xab6000
+#define DEFAULT_STACK_PGS		5
+
+enum vm_guest_mode {
+	VM_MODE_P52V48_4K,
+	VM_MODE_P52V48_64K,
+	VM_MODE_P48V48_4K,
+	VM_MODE_P48V48_16K,
+	VM_MODE_P48V48_64K,
+	VM_MODE_P40V48_4K,
+	VM_MODE_P40V48_16K,
+	VM_MODE_P40V48_64K,
+	VM_MODE_PXXV48_4K,	/* For 48bits VA but ANY bits PA */
+	VM_MODE_P47V64_4K,
+	VM_MODE_P44V64_4K,
+	VM_MODE_P36V48_4K,
+	VM_MODE_P36V48_16K,
+	VM_MODE_P36V48_64K,
+	VM_MODE_P36V47_16K,
+	NUM_VM_MODES,
+};
+
+#if defined(__aarch64__)
+
+extern enum vm_guest_mode vm_mode_default;
+
+#define VM_MODE_DEFAULT			vm_mode_default
+#define MIN_PAGE_SHIFT			12U
+#define ptes_per_page(page_size)	((page_size) / 8)
+
+#elif defined(__x86_64__)
+
+#define VM_MODE_DEFAULT			VM_MODE_PXXV48_4K
+#define MIN_PAGE_SHIFT			12U
+#define ptes_per_page(page_size)	((page_size) / 8)
+
+#elif defined(__s390x__)
+
+#define VM_MODE_DEFAULT			VM_MODE_P44V64_4K
+#define MIN_PAGE_SHIFT			12U
+#define ptes_per_page(page_size)	((page_size) / 16)
+
+#elif defined(__riscv)
+
+#if __riscv_xlen == 32
+#error "RISC-V 32-bit kvm selftests not supported"
+#endif
+
+#define VM_MODE_DEFAULT			VM_MODE_P40V48_4K
+#define MIN_PAGE_SHIFT			12U
+#define ptes_per_page(page_size)	((page_size) / 8)
+
+#endif
+
+#define MIN_PAGE_SIZE		(1U << MIN_PAGE_SHIFT)
+#define PTES_PER_MIN_PAGE	ptes_per_page(MIN_PAGE_SIZE)
+
+struct vm_guest_mode_params {
+	unsigned int pa_bits;
+	unsigned int va_bits;
+	unsigned int page_size;
+	unsigned int page_shift;
+};
+extern const struct vm_guest_mode_params vm_guest_mode_params[];
+
+int open_path_or_exit(const char *path, int flags);
+int open_kvm_dev_path_or_exit(void);
+int kvm_check_cap(long cap);
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
+int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
+		    struct kvm_enable_cap *cap);
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
+
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
+void kvm_vm_free(struct kvm_vm *vmp);
+void kvm_vm_restart(struct kvm_vm *vmp, int perm);
+void kvm_vm_release(struct kvm_vm *vmp);
+void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+			    uint64_t first_page, uint32_t num_pages);
+uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm);
+
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
+		       size_t len);
+
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename);
+
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * VM VCPU Dump
+ *
+ * Input Args:
+ *   stream - Output FILE stream
+ *   vm     - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by @vcpuid, within the VM
+ * given by @vm, to the FILE stream given by @stream.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
+	       uint8_t indent);
+
+void vm_create_irqchip(struct kvm_vm *vm);
+
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+	enum vm_mem_backing_src_type src_type,
+	uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+	uint32_t flags);
+
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+		void *arg);
+int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+		void *arg);
+void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg);
+void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
+vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
+vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
+
+void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+	      unsigned int npages);
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
+
+/*
+ * Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent VM physical address
+ *
+ * Returns the VM physical address of the translated VM virtual
+ * address given by @gva.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
+
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+int vcpu_get_fd(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+			  struct kvm_guest_debug *debug);
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+		       struct kvm_mp_state *mp_state);
+struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+
+/*
+ * VM VCPU Args Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   num - number of arguments
+ *   ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first @num function input registers of the VCPU with @vcpuid,
+ * per the C calling convention of the architecture, to the values given
+ * as variable args. Each of the variable args is expected to be of type
+ * uint64_t. The maximum @num can be is specific to the architecture.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
+
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
+		    struct kvm_sregs *sregs);
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+		    struct kvm_sregs *sregs);
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+		    struct kvm_sregs *sregs);
+void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid,
+		  struct kvm_fpu *fpu);
+void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid,
+		  struct kvm_fpu *fpu);
+void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+#ifdef __KVM_HAVE_VCPU_EVENTS
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_vcpu_events *events);
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_vcpu_events *events);
+#endif
+#ifdef __x86_64__
+void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
+			   struct kvm_nested_state *state);
+int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
+			  struct kvm_nested_state *state, bool ignore_error);
+#endif
+void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
+
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+		       void *val, bool write);
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+		      void *val, bool write);
+void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
+int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level);
+
+int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+			  uint64_t attr);
+int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+			 uint64_t attr);
+int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+			  uint64_t attr, void *val, bool write);
+int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+			 uint64_t attr, void *val, bool write);
+
+#define KVM_MAX_IRQ_ROUTES		4096
+
+struct kvm_irq_routing *kvm_gsi_routing_create(void);
+void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
+		uint32_t gsi, uint32_t pin);
+int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
+void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing);
+
+const char *exit_reason_str(unsigned int exit_reason);
+
+void virt_pgd_alloc(struct kvm_vm *vm);
+
+/*
+ * VM Virtual Page Map
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vaddr - VM Virtual Address
+ *   paddr - VM Physical Address
+ *   memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within @vm, creates a virtual translation for the page starting
+ * at @vaddr to the page starting at @paddr.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr);
+
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+			     uint32_t memslot);
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+			      vm_paddr_t paddr_min, uint32_t memslot);
+vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
+
+/*
+ * Create a VM with reasonable defaults
+ *
+ * Input Args:
+ *   vcpuid - The id of the single VCPU to add to the VM.
+ *   extra_mem_pages - The number of extra pages to add (this will
+ *                     decide how much extra space we will need to
+ *                     setup the page tables using memslot 0)
+ *   guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+				 void *guest_code);
+
+/* Same as vm_create_default, but can be used for more than one vcpu */
+struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_mem_pages,
+					    uint32_t num_percpu_pages, void *guest_code,
+					    uint32_t vcpuids[]);
+
+/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */
+struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
+				    uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
+				    uint32_t num_percpu_pages, void *guest_code,
+				    uint32_t vcpuids[]);
+
+/*
+ * Adds a vCPU with reasonable defaults (e.g. a stack)
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - The id of the VCPU to add to the VM.
+ *   guest_code - The vCPU's entry point
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
+
+bool vm_is_unrestricted_guest(struct kvm_vm *vm);
+
+unsigned int vm_get_page_size(struct kvm_vm *vm);
+unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
+uint64_t vm_get_max_gfn(struct kvm_vm *vm);
+int vm_get_fd(struct kvm_vm *vm);
+
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
+unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
+unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
+static inline unsigned int
+vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+	unsigned int n;
+	n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages));
+#ifdef __s390x__
+	/* s390 requires 1M aligned guest sizes */
+	n = (n + 255) & ~255;
+#endif
+	return n;
+}
+
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+				 uint64_t end);
+
+struct kvm_dirty_log *
+allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
+
+int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
+
+#define sync_global_to_guest(vm, g) ({				\
+	typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));	\
+	memcpy(_p, &(g), sizeof(g));				\
+})
+
+#define sync_global_from_guest(vm, g) ({			\
+	typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g));	\
+	memcpy(&(g), _p, sizeof(g));				\
+})
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
+
+/* Common ucalls */
+enum {
+	UCALL_NONE,
+	UCALL_SYNC,
+	UCALL_ABORT,
+	UCALL_DONE,
+	UCALL_UNHANDLED,
+};
+
+#define UCALL_MAX_ARGS 6
+
+struct ucall {
+	uint64_t cmd;
+	uint64_t args[UCALL_MAX_ARGS];
+};
+
+void ucall_init(struct kvm_vm *vm, void *arg);
+void ucall_uninit(struct kvm_vm *vm);
+void ucall(uint64_t cmd, int nargs, ...);
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4)	\
+				ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
+#define GUEST_SYNC(stage)	ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_DONE()		ucall(UCALL_DONE, 0)
+#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do {    \
+	if (!(_condition))                                              \
+		ucall(UCALL_ABORT, 2 + _nargs,                          \
+			"Failed guest assert: "                         \
+			_condstr, __LINE__, _args);                     \
+} while (0)
+
+#define GUEST_ASSERT(_condition) \
+	__GUEST_ASSERT(_condition, #_condition, 0, 0)
+
+#define GUEST_ASSERT_1(_condition, arg1) \
+	__GUEST_ASSERT(_condition, #_condition, 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+	__GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+	__GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+	__GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4))
+
+#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b)
+
+int vm_get_stats_fd(struct kvm_vm *vm);
+int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid);
+
+uint32_t guest_get_vcpuid(void);
+
+#endif /* SELFTEST_KVM_UTIL_BASE_H */
-- 
cgit v1.2.3


From 96c1a6285568d31a8125c36974a140fccbe548af Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Fri, 10 Dec 2021 10:46:12 -0600
Subject: kvm: selftests: move ucall declarations into ucall_common.h

Now that core kvm_util declarations have special home in
kvm_util_base.h, move ucall-related declarations out into a separate
header.

Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-Id: <20211210164620.11636-3-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     |  1 +
 .../testing/selftests/kvm/include/kvm_util_base.h  | 49 ------------------
 tools/testing/selftests/kvm/include/ucall_common.h | 59 ++++++++++++++++++++++
 3 files changed, 60 insertions(+), 49 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/ucall_common.h

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index c860ced3888d..c9286811a4cb 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -8,5 +8,6 @@
 #define SELFTEST_KVM_UTIL_H
 
 #include "kvm_util_base.h"
+#include "ucall_common.h"
 
 #endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 749da2c4bab0..1e5ab6a92848 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -389,55 +389,6 @@ int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
 
 void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
 
-/* Common ucalls */
-enum {
-	UCALL_NONE,
-	UCALL_SYNC,
-	UCALL_ABORT,
-	UCALL_DONE,
-	UCALL_UNHANDLED,
-};
-
-#define UCALL_MAX_ARGS 6
-
-struct ucall {
-	uint64_t cmd;
-	uint64_t args[UCALL_MAX_ARGS];
-};
-
-void ucall_init(struct kvm_vm *vm, void *arg);
-void ucall_uninit(struct kvm_vm *vm);
-void ucall(uint64_t cmd, int nargs, ...);
-uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
-
-#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4)	\
-				ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
-#define GUEST_SYNC(stage)	ucall(UCALL_SYNC, 2, "hello", stage)
-#define GUEST_DONE()		ucall(UCALL_DONE, 0)
-#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do {    \
-	if (!(_condition))                                              \
-		ucall(UCALL_ABORT, 2 + _nargs,                          \
-			"Failed guest assert: "                         \
-			_condstr, __LINE__, _args);                     \
-} while (0)
-
-#define GUEST_ASSERT(_condition) \
-	__GUEST_ASSERT(_condition, #_condition, 0, 0)
-
-#define GUEST_ASSERT_1(_condition, arg1) \
-	__GUEST_ASSERT(_condition, #_condition, 1, (arg1))
-
-#define GUEST_ASSERT_2(_condition, arg1, arg2) \
-	__GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2))
-
-#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
-	__GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3))
-
-#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
-	__GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4))
-
-#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b)
-
 int vm_get_stats_fd(struct kvm_vm *vm);
 int vcpu_get_stats_fd(struct kvm_vm *vm, uint32_t vcpuid);
 
diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
new file mode 100644
index 000000000000..9eecc9d40b79
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/ucall_common.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/kvm_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+#ifndef SELFTEST_KVM_UCALL_COMMON_H
+#define SELFTEST_KVM_UCALL_COMMON_H
+
+/* Common ucalls */
+enum {
+	UCALL_NONE,
+	UCALL_SYNC,
+	UCALL_ABORT,
+	UCALL_DONE,
+	UCALL_UNHANDLED,
+};
+
+#define UCALL_MAX_ARGS 6
+
+struct ucall {
+	uint64_t cmd;
+	uint64_t args[UCALL_MAX_ARGS];
+};
+
+void ucall_init(struct kvm_vm *vm, void *arg);
+void ucall_uninit(struct kvm_vm *vm);
+void ucall(uint64_t cmd, int nargs, ...);
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4)	\
+				ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
+#define GUEST_SYNC(stage)	ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_DONE()		ucall(UCALL_DONE, 0)
+#define __GUEST_ASSERT(_condition, _condstr, _nargs, _args...) do {    \
+	if (!(_condition))                                              \
+		ucall(UCALL_ABORT, 2 + _nargs,                          \
+			"Failed guest assert: "                         \
+			_condstr, __LINE__, _args);                     \
+} while (0)
+
+#define GUEST_ASSERT(_condition) \
+	__GUEST_ASSERT(_condition, #_condition, 0, 0)
+
+#define GUEST_ASSERT_1(_condition, arg1) \
+	__GUEST_ASSERT(_condition, #_condition, 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+	__GUEST_ASSERT(_condition, #_condition, 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+	__GUEST_ASSERT(_condition, #_condition, 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+	__GUEST_ASSERT(_condition, #_condition, 4, (arg1), (arg2), (arg3), (arg4))
+
+#define GUEST_ASSERT_EQ(a, b) __GUEST_ASSERT((a) == (b), #a " == " #b, 2, a, b)
+
+#endif /* SELFTEST_KVM_UCALL_COMMON_H */
-- 
cgit v1.2.3


From f06a82f9d31a87878a9295bac1defdadbc77bbc0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 6 Jan 2022 23:20:30 +0100
Subject: perf trace: Avoid early exit due to running SIGCHLD handler before it
 makes sense to

When running 'perf trace' with an BPF object like:

  # perf trace -e openat,tools/perf/examples/bpf/hello.c

the event parsing eventually calls llvm__get_kbuild_opts() that runs a
script and that ends up with SIGCHLD delivered to the 'perf trace'
handler, which assumes the workload process is done and quits 'perf
trace'.

Move the SIGCHLD handler setup directly to trace__run(), where the event
is parsed and the object is already compiled.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Christy Lee <christyc.y.lee@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20220106222030.227499-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0b52e08e558e..ef94388e8323 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3925,6 +3925,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	bool draining = false;
 
 	trace->live = true;
+	signal(SIGCHLD, sig_handler);
 
 	if (!trace->raw_augmented_syscalls) {
 		if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
@@ -4873,7 +4874,6 @@ int cmd_trace(int argc, const char **argv)
 
 	signal(SIGSEGV, sighandler_dump_stack);
 	signal(SIGFPE, sighandler_dump_stack);
-	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
 	trace.evlist = evlist__new();
-- 
cgit v1.2.3


From dc9f2dd5de04d2bbcccbabdf5df9715c2ddcf25f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 7 Jan 2022 16:02:54 -0300
Subject: Revert "libtraceevent: Increase libtraceevent logging when verbose"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 08efcb4a638d260ef7fcbae64ecf7ceceb3f1841.

This breaks the build as it will prefer using libbpf-devel header files,
even when not using LIBBPF_DYNAMIC=1, breaking the build.

This was detected on OpenSuSE Tumbleweed with libtraceevent-devel 1.3.0,
as described by Jiri Slaby:

=======================================================================
It breaks build with LIBTRACEEVENT_DYNAMIC and version 1.3.0:
> util/debug.c: In function ‘perf_debug_option’:
> util/debug.c:243:17: error: implicit declaration of function
‘tep_set_loglevel’ [-Werror=implicit-function-declaration]
>   243 |                 tep_set_loglevel(TEP_LOG_INFO);
>       |                 ^~~~~~~~~~~~~~~~
> util/debug.c:243:34: error: ‘TEP_LOG_INFO’ undeclared (first use in this
function); did you mean ‘TEP_PRINT_INFO’?
>   243 |                 tep_set_loglevel(TEP_LOG_INFO);
>       |                                  ^~~~~~~~~~~~
>       |                                  TEP_PRINT_INFO
> util/debug.c:243:34: note: each undeclared identifier is reported only once
for each function it appears in
> util/debug.c:245:34: error: ‘TEP_LOG_DEBUG’ undeclared (first use in this
function)
>   245 |                 tep_set_loglevel(TEP_LOG_DEBUG);
>       |                                  ^~~~~~~~~~~~~
> util/debug.c:247:34: error: ‘TEP_LOG_ALL’ undeclared (first use in this
function)
>   247 |                 tep_set_loglevel(TEP_LOG_ALL);
>       |                                  ^~~~~~~~~~~

It is because the gcc's command line looks like:
gcc
...
-I/home/abuild/rpmbuild/BUILD/tools/lib/
...
-DLIBTRACEEVENT_VERSION=65790
...
=======================================================================

The proper way to fix this is more involved and so not suitable for this
late in the 5.16-rc stage.

Reported-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/lkml/bc2b0786-8965-1bcd-2316-9d9bb37b9c31@kernel.org
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: https://lore.kernel.org/lkml/YddGjjmlMZzxUZbN@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/debug.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index c7a9fa0ffae9..2c06abf6dcd2 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -24,16 +24,6 @@
 #include "util/parse-sublevel-options.h"
 
 #include <linux/ctype.h>
-#include <traceevent/event-parse.h>
-
-#define MAKE_LIBTRACEEVENT_VERSION(a, b, c) ((a)*255*255+(b)*255+(c))
-#ifndef LIBTRACEEVENT_VERSION
-/*
- * If LIBTRACEEVENT_VERSION wasn't computed then set to version 1.1.0 that ships
- * with the Linux kernel tools.
- */
-#define LIBTRACEEVENT_VERSION MAKE_LIBTRACEEVENT_VERSION(1, 1, 0)
-#endif
 
 int verbose;
 int debug_peo_args;
@@ -238,15 +228,6 @@ int perf_debug_option(const char *str)
 	/* Allow only verbose value in range (0, 10), otherwise set 0. */
 	verbose = (verbose < 0) || (verbose > 10) ? 0 : verbose;
 
-#if MAKE_LIBTRACEEVENT_VERSION(1, 3, 0) <= LIBTRACEEVENT_VERSION
-	if (verbose == 1)
-		tep_set_loglevel(TEP_LOG_INFO);
-	else if (verbose == 2)
-		tep_set_loglevel(TEP_LOG_DEBUG);
-	else if (verbose >= 3)
-		tep_set_loglevel(TEP_LOG_ALL);
-#endif
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 327b9a94e2a805dea5238940c9660ca367dbfdef Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 7 Jan 2022 11:25:22 -0800
Subject: selftests: mptcp: more stable join tests-cases

MPTCP join self-tests are a bit fragile as they reply on
delays instead of events to catch-up with the expected
sockets states.

Replace the delay with state checking where possible and
reduce the number of sleeps in the most complex scenarios.

This will both reduce the tests run-time and will improve
stability.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 120 ++++++++++++++----------
 1 file changed, 68 insertions(+), 52 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 3165bd1a43cc..27d0eb9afdca 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -238,6 +238,45 @@ is_v6()
 	[ -z "${1##*:*}" ]
 }
 
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+	local listener_ns="${1}"
+	local port="${2}"
+
+	local port_hex i
+
+	port_hex="$(printf "%04X" "${port}")"
+	for i in $(seq 10); do
+		ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+			awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+			break
+		sleep 0.1
+	done
+}
+
+rm_addr_count()
+{
+	ns=${1}
+
+	ip netns exec ${ns} nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'
+}
+
+# $1: ns, $2: old rm_addr counter in $ns
+wait_rm_addr()
+{
+	local ns="${1}"
+	local old_cnt="${2}"
+	local cnt
+	local i
+
+	for i in $(seq 10); do
+		cnt=$(rm_addr_count ${ns})
+		[ "$cnt" = "${old_cnt}" ] || break
+		sleep 0.1
+	done
+}
+
 do_transfer()
 {
 	listener_ns="$1"
@@ -307,7 +346,7 @@ do_transfer()
 	fi
 	spid=$!
 
-	sleep 1
+	wait_local_port_listen "${listener_ns}" "${port}"
 
 	if [ "$test_link_fail" -eq 0 ];then
 		timeout ${timeout_test} \
@@ -324,10 +363,13 @@ do_transfer()
 	fi
 	cpid=$!
 
+	# let the mptcp subflow be established in background before
+	# do endpoint manipulation
+	[ $addr_nr_ns1 = "0" -a $addr_nr_ns2 = "0" ] || sleep 1
+
 	if [ $addr_nr_ns1 -gt 0 ]; then
 		let add_nr_ns1=addr_nr_ns1
 		counter=2
-		sleep 1
 		while [ $add_nr_ns1 -gt 0 ]; do
 			local addr
 			if is_v6 "${connect_addr}"; then
@@ -339,7 +381,6 @@ do_transfer()
 			let counter+=1
 			let add_nr_ns1-=1
 		done
-		sleep 1
 	elif [ $addr_nr_ns1 -lt 0 ]; then
 		let rm_nr_ns1=-addr_nr_ns1
 		if [ $rm_nr_ns1 -lt 8 ]; then
@@ -347,22 +388,19 @@ do_transfer()
 			pos=1
 			dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
 			if [ ${#dump[@]} -gt 0 ]; then
-				sleep 1
-
 				while [ $counter -le $rm_nr_ns1 ]
 				do
 					id=${dump[$pos]}
+					rm_addr=$(rm_addr_count ${connector_ns})
 					ip netns exec ${listener_ns} ./pm_nl_ctl del $id
-					sleep 1
+					wait_rm_addr ${connector_ns} ${rm_addr}
 					let counter+=1
 					let pos+=5
 				done
 			fi
 		elif [ $rm_nr_ns1 -eq 8 ]; then
-			sleep 1
 			ip netns exec ${listener_ns} ./pm_nl_ctl flush
 		elif [ $rm_nr_ns1 -eq 9 ]; then
-			sleep 1
 			ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr}
 		fi
 	fi
@@ -373,10 +411,13 @@ do_transfer()
 		addr_nr_ns2=${addr_nr_ns2:9}
 	fi
 
+	# if newly added endpoints must be deleted, give the background msk
+	# some time to created them
+	[ $addr_nr_ns1 -gt 0 -a $addr_nr_ns2 -lt 0 ] && sleep 1
+
 	if [ $addr_nr_ns2 -gt 0 ]; then
 		let add_nr_ns2=addr_nr_ns2
 		counter=3
-		sleep 1
 		while [ $add_nr_ns2 -gt 0 ]; do
 			local addr
 			if is_v6 "${connect_addr}"; then
@@ -388,7 +429,6 @@ do_transfer()
 			let counter+=1
 			let add_nr_ns2-=1
 		done
-		sleep 1
 	elif [ $addr_nr_ns2 -lt 0 ]; then
 		let rm_nr_ns2=-addr_nr_ns2
 		if [ $rm_nr_ns2 -lt 8 ]; then
@@ -396,19 +436,18 @@ do_transfer()
 			pos=1
 			dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
 			if [ ${#dump[@]} -gt 0 ]; then
-				sleep 1
-
 				while [ $counter -le $rm_nr_ns2 ]
 				do
+					# rm_addr are serialized, allow the previous one to complete
 					id=${dump[$pos]}
+					rm_addr=$(rm_addr_count ${listener_ns})
 					ip netns exec ${connector_ns} ./pm_nl_ctl del $id
-					sleep 1
+					wait_rm_addr ${listener_ns} ${rm_addr}
 					let counter+=1
 					let pos+=5
 				done
 			fi
 		elif [ $rm_nr_ns2 -eq 8 ]; then
-			sleep 1
 			ip netns exec ${connector_ns} ./pm_nl_ctl flush
 		elif [ $rm_nr_ns2 -eq 9 ]; then
 			local addr
@@ -417,7 +456,6 @@ do_transfer()
 			else
 				addr="10.0.1.2"
 			fi
-			sleep 1
 			ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr
 		fi
 	fi
@@ -539,6 +577,14 @@ run_tests()
 	lret=$?
 }
 
+dump_stats()
+{
+	echo Server ns stats
+	ip netns exec $ns1 nstat -as | grep Tcp
+	echo Client ns stats
+	ip netns exec $ns2 nstat -as | grep Tcp
+}
+
 chk_csum_nr()
 {
 	local msg=${1:-""}
@@ -570,12 +616,7 @@ chk_csum_nr()
 	else
 		echo "[ ok ]"
 	fi
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_fail_nr()
@@ -607,12 +648,7 @@ chk_fail_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_join_nr()
@@ -656,12 +692,7 @@ chk_join_nr()
 	else
 		echo "[ ok ]"
 	fi
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 	if [ $checksum -eq 1 ]; then
 		chk_csum_nr
 		chk_fail_nr 0 0
@@ -823,12 +854,7 @@ chk_add_nr()
 		echo ""
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_rm_nr()
@@ -871,12 +897,7 @@ chk_rm_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_prio_nr()
@@ -908,12 +929,7 @@ chk_prio_nr()
 		echo "[ ok ]"
 	fi
 
-	if [ "${dump_stats}" = 1 ]; then
-		echo Server ns stats
-		ip netns exec $ns1 nstat -as | grep MPTcp
-		echo Client ns stats
-		ip netns exec $ns2 nstat -as | grep MPTcp
-	fi
+	[ "${dump_stats}" = 1 ] && dump_stats
 }
 
 chk_link_usage()
@@ -1651,7 +1667,7 @@ add_addr_ports_tests()
 	ip netns exec $ns2 ./pm_nl_ctl limits 1 3
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
-	run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
+	run_tests $ns1 $ns2 10.0.1.1 0 -8 -2 slow
 	chk_join_nr "flush subflows and signal with port" 3 3 3
 	chk_add_nr 1 1
 	chk_rm_nr 2 2
-- 
cgit v1.2.3


From e3304c21357268ecbe156ed6247a03dc78d3fce4 Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Fri, 3 Dec 2021 07:50:37 +0530
Subject: perf sort: Include global and local variants for p_stage_cyc sort key

Sort key 'p_stage_cyc' is used to present the latency cycles spent in
pipeline stages.

perf has local 'p_stage_cyc' sort key to display this info. There is no
global variant available for this sort key. The local variant shows
latency in a single sample, whereas the global value will be useful to
present the total latency (sum of latencies) in the hist entry. It
represents the latency number multiplied by the number of samples.

Add global ('p_stage_cyc') and local variant ('local_p_stage_cyc') for
this sort key. Use 'local_p_stage_cyc' as default option for "mem" sort
mode.

Also add this to the list of dynamic sort keys and made the
"dynamic_headers" and "arch_specific_sort_keys" as static.

Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lore.kernel.org/r/20211203022038.48240-1-atrajeev@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hist.c |  4 +++-
 tools/perf/util/hist.h |  3 ++-
 tools/perf/util/sort.c | 34 +++++++++++++++++++++++++---------
 tools/perf/util/sort.h |  3 ++-
 4 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index b776465e04ef..0a8033b09e28 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,7 +211,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
 	hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
 	hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
-	hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
+	hists__new_col_len(hists, HISTC_LOCAL_P_STAGE_CYC, 13);
+	hists__new_col_len(hists, HISTC_GLOBAL_P_STAGE_CYC, 13);
+
 	if (symbol_conf.nanosecs)
 		hists__new_col_len(hists, HISTC_TIME, 16);
 	else
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 621f35ae1efa..2a15e22fb89c 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -75,7 +75,8 @@ enum hist_column {
 	HISTC_MEM_BLOCKED,
 	HISTC_LOCAL_INS_LAT,
 	HISTC_GLOBAL_INS_LAT,
-	HISTC_P_STAGE_CYC,
+	HISTC_LOCAL_P_STAGE_CYC,
+	HISTC_GLOBAL_P_STAGE_CYC,
 	HISTC_NR_COLS, /* Last entry */
 };
 
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index a111065b484e..e417e47f51b9 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -37,7 +37,7 @@ const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
 const char	*default_sort_order = "comm,dso,symbol";
 const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
+const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,local_p_stage_cyc";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
 const char	default_tracepoint_sort_order[] = "trace";
@@ -46,8 +46,8 @@ const char	*field_order;
 regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
-const char	*dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
-const char	*arch_specific_sort_keys[] = {"p_stage_cyc"};
+static const char *const dynamic_headers[] = {"local_ins_lat", "ins_lat", "local_p_stage_cyc", "p_stage_cyc"};
+static const char *const arch_specific_sort_keys[] = {"local_p_stage_cyc", "p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1392,22 +1392,37 @@ struct sort_entry sort_global_ins_lat = {
 };
 
 static int64_t
-sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+sort__p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	return left->p_stage_cyc - right->p_stage_cyc;
 }
 
+static int hist_entry__global_p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*u", width,
+			he->p_stage_cyc * he->stat.nr_events);
+}
+
+
 static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
 					size_t size, unsigned int width)
 {
 	return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc);
 }
 
-struct sort_entry sort_p_stage_cyc = {
-	.se_header      = "Pipeline Stage Cycle",
-	.se_cmp         = sort__global_p_stage_cyc_cmp,
+struct sort_entry sort_local_p_stage_cyc = {
+	.se_header      = "Local Pipeline Stage Cycle",
+	.se_cmp         = sort__p_stage_cyc_cmp,
 	.se_snprintf	= hist_entry__p_stage_cyc_snprintf,
-	.se_width_idx	= HISTC_P_STAGE_CYC,
+	.se_width_idx	= HISTC_LOCAL_P_STAGE_CYC,
+};
+
+struct sort_entry sort_global_p_stage_cyc = {
+	.se_header      = "Pipeline Stage Cycle",
+	.se_cmp         = sort__p_stage_cyc_cmp,
+	.se_snprintf    = hist_entry__global_p_stage_cyc_snprintf,
+	.se_width_idx   = HISTC_GLOBAL_P_STAGE_CYC,
 };
 
 struct sort_entry sort_mem_daddr_sym = {
@@ -1858,7 +1873,8 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
 	DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
 	DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
-	DIM(SORT_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
+	DIM(SORT_LOCAL_PIPELINE_STAGE_CYC, "local_p_stage_cyc", sort_local_p_stage_cyc),
+	DIM(SORT_GLOBAL_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_global_p_stage_cyc),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 7b7145501933..f994261888e1 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -235,7 +235,8 @@ enum sort_type {
 	SORT_CODE_PAGE_SIZE,
 	SORT_LOCAL_INS_LAT,
 	SORT_GLOBAL_INS_LAT,
-	SORT_PIPELINE_STAGE_CYC,
+	SORT_LOCAL_PIPELINE_STAGE_CYC,
+	SORT_GLOBAL_PIPELINE_STAGE_CYC,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
-- 
cgit v1.2.3


From befee3775b6dabd7ec1bd8a44584f7f6f8fc8329 Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Fri, 3 Dec 2021 07:50:38 +0530
Subject: perf powerpc: Update global/local variants for p_stage_cyc

Update the arch_support_sort_key() function in powerpc to enable
presenting local and global variants of sort key 'p_stage_cyc'.

Update the "se_header" strings for these in arch_perf_header_entry()
along with instruction latency.

Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lore.kernel.org/r/20211203022038.48240-2-atrajeev@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/powerpc/util/event.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index 3bf441257466..cf430a4c55b9 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -40,8 +40,12 @@ const char *arch_perf_header_entry(const char *se_header)
 {
 	if (!strcmp(se_header, "Local INSTR Latency"))
 		return "Finish Cyc";
-	else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+	else if (!strcmp(se_header, "INSTR Latency"))
+		return "Global Finish_cyc";
+	else if (!strcmp(se_header, "Local Pipeline Stage Cycle"))
 		return "Dispatch Cyc";
+	else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+		return "Global Dispatch_cyc";
 	return se_header;
 }
 
@@ -49,5 +53,7 @@ int arch_support_sort_key(const char *sort_key)
 {
 	if (!strcmp(sort_key, "p_stage_cyc"))
 		return 1;
+	if (!strcmp(sort_key, "local_p_stage_cyc"))
+		return 1;
 	return 0;
 }
-- 
cgit v1.2.3


From d5962fb7d69073bf68fb647531cfd4f0adf84be3 Mon Sep 17 00:00:00 2001
From: Dario Petrillo <dario.pk1@gmail.com>
Date: Mon, 10 Jan 2022 00:44:41 +0100
Subject: perf annotate: Avoid TUI crash when navigating in the annotation of
 recursive functions

In 'perf report', entering a recursive function from inside of itself
(either directly of indirectly through some other function) results in
calling symbol__annotate2 multiple() times, and freeing the whole
disassembly when exiting from the innermost instance.

The first issue causes the function's disassembly to be duplicated, and
the latter a heap use-after-free (and crash) when trying to access the
disassembly again.

I reproduced the bug on perf 5.11.22 (Ubuntu 20.04.3 LTS) and 5.16.rc8
with the following testcase (compile with gcc recursive.c -o recursive).
To reproduce:

- perf record ./recursive
- perf report
- enter fibonacci and annotate it
- move the cursor on one of the "callq fibonacci" instructions and press enter
  - at this point there will be two copies of the function in the disassembly
- go back by pressing q, and perf will crash

  #include <stdio.h>

  int fibonacci(int n)
  {
      if(n <= 2) return 1;
      return fibonacci(n-1) + fibonacci(n-2);
  }

  int main()
  {
      printf("%d\n", fibonacci(40));
  }

This patch addresses the issue by annotating a function and freeing the
associated memory on exit only if no annotation is already present, so
that a recursive function is only annotated on entry.

Signed-off-by: Dario Petrillo <dario.pk1@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@kernel.org
Link: http://lore.kernel.org/lkml/20220109234441.325106-1-dario.pk1@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/browsers/annotate.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index e81c2493efdf..44ba900828f6 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -966,6 +966,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel,
 		.opts = opts,
 	};
 	int ret = -1, err;
+	int not_annotated = list_empty(&notes->src->source);
 
 	if (sym == NULL)
 		return -1;
@@ -973,13 +974,15 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel,
 	if (ms->map->dso->annotate_warned)
 		return -1;
 
-	err = symbol__annotate2(ms, evsel, opts, &browser.arch);
-	if (err) {
-		char msg[BUFSIZ];
-		ms->map->dso->annotate_warned = true;
-		symbol__strerror_disassemble(ms, err, msg, sizeof(msg));
-		ui__error("Couldn't annotate %s:\n%s", sym->name, msg);
-		goto out_free_offsets;
+	if (not_annotated) {
+		err = symbol__annotate2(ms, evsel, opts, &browser.arch);
+		if (err) {
+			char msg[BUFSIZ];
+			ms->map->dso->annotate_warned = true;
+			symbol__strerror_disassemble(ms, err, msg, sizeof(msg));
+			ui__error("Couldn't annotate %s:\n%s", sym->name, msg);
+			goto out_free_offsets;
+		}
 	}
 
 	ui_helpline__push("Press ESC to exit");
@@ -994,9 +997,11 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel,
 
 	ret = annotate_browser__run(&browser, evsel, hbt);
 
-	annotated_source__purge(notes->src);
+	if(not_annotated)
+		annotated_source__purge(notes->src);
 
 out_free_offsets:
-	zfree(&notes->offsets);
+	if(not_annotated)
+		zfree(&notes->offsets);
 	return ret;
 }
-- 
cgit v1.2.3


From 4b27480dcaa71e7ee9f56907e419c6a1511fd2b2 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 7 Jan 2022 23:11:14 +0100
Subject: bpf/selftests: convert xdp_link test to ASSERT_* macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

v2:
- Don't add if statements around checks if they weren't there before.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20220107221115.326171-2-toke@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_link.c | 56 ++++++++++-------------
 1 file changed, 25 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
index 983ab0b47d30..eec0bf83546b 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
@@ -8,46 +8,47 @@
 
 void serial_test_xdp_link(void)
 {
-	__u32 duration = 0, id1, id2, id0 = 0, prog_fd1, prog_fd2, err;
 	DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1);
 	struct test_xdp_link *skel1 = NULL, *skel2 = NULL;
+	__u32 id1, id2, id0 = 0, prog_fd1, prog_fd2;
 	struct bpf_link_info link_info;
 	struct bpf_prog_info prog_info;
 	struct bpf_link *link;
+	int err;
 	__u32 link_info_len = sizeof(link_info);
 	__u32 prog_info_len = sizeof(prog_info);
 
 	skel1 = test_xdp_link__open_and_load();
-	if (CHECK(!skel1, "skel_load", "skeleton open and load failed\n"))
+	if (!ASSERT_OK_PTR(skel1, "skel_load"))
 		goto cleanup;
 	prog_fd1 = bpf_program__fd(skel1->progs.xdp_handler);
 
 	skel2 = test_xdp_link__open_and_load();
-	if (CHECK(!skel2, "skel_load", "skeleton open and load failed\n"))
+	if (!ASSERT_OK_PTR(skel2, "skel_load"))
 		goto cleanup;
 	prog_fd2 = bpf_program__fd(skel2->progs.xdp_handler);
 
 	memset(&prog_info, 0, sizeof(prog_info));
 	err = bpf_obj_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len);
-	if (CHECK(err, "fd_info1", "failed %d\n", -errno))
+	if (!ASSERT_OK(err, "fd_info1"))
 		goto cleanup;
 	id1 = prog_info.id;
 
 	memset(&prog_info, 0, sizeof(prog_info));
 	err = bpf_obj_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len);
-	if (CHECK(err, "fd_info2", "failed %d\n", -errno))
+	if (!ASSERT_OK(err, "fd_info2"))
 		goto cleanup;
 	id2 = prog_info.id;
 
 	/* set initial prog attachment */
 	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts);
-	if (CHECK(err, "fd_attach", "initial prog attach failed: %d\n", err))
+	if (!ASSERT_OK(err, "fd_attach"))
 		goto cleanup;
 
 	/* validate prog ID */
 	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
-	CHECK(err || id0 != id1, "id1_check",
-	      "loaded prog id %u != id1 %u, err %d", id0, id1, err);
+	if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val"))
+		goto cleanup;
 
 	/* BPF link is not allowed to replace prog attachment */
 	link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO);
@@ -62,7 +63,7 @@ void serial_test_xdp_link(void)
 	/* detach BPF program */
 	opts.old_fd = prog_fd1;
 	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts);
-	if (CHECK(err, "prog_detach", "failed %d\n", err))
+	if (!ASSERT_OK(err, "prog_detach"))
 		goto cleanup;
 
 	/* now BPF link should attach successfully */
@@ -73,24 +74,23 @@ void serial_test_xdp_link(void)
 
 	/* validate prog ID */
 	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
-	if (CHECK(err || id0 != id1, "id1_check",
-		  "loaded prog id %u != id1 %u, err %d", id0, id1, err))
+	if (!ASSERT_OK(err, "id1_check_err") || !ASSERT_EQ(id0, id1, "id1_check_val"))
 		goto cleanup;
 
 	/* BPF prog attach is not allowed to replace BPF link */
 	opts.old_fd = prog_fd1;
 	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts);
-	if (CHECK(!err, "prog_attach_fail", "unexpected success\n"))
+	if (!ASSERT_ERR(err, "prog_attach_fail"))
 		goto cleanup;
 
 	/* Can't force-update when BPF link is active */
 	err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd2, 0);
-	if (CHECK(!err, "prog_update_fail", "unexpected success\n"))
+	if (!ASSERT_ERR(err, "prog_update_fail"))
 		goto cleanup;
 
 	/* Can't force-detach when BPF link is active */
 	err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
-	if (CHECK(!err, "prog_detach_fail", "unexpected success\n"))
+	if (!ASSERT_ERR(err, "prog_detach_fail"))
 		goto cleanup;
 
 	/* BPF link is not allowed to replace another BPF link */
@@ -110,40 +110,34 @@ void serial_test_xdp_link(void)
 	skel2->links.xdp_handler = link;
 
 	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
-	if (CHECK(err || id0 != id2, "id2_check",
-		  "loaded prog id %u != id2 %u, err %d", id0, id1, err))
+	if (!ASSERT_OK(err, "id2_check_err") || !ASSERT_EQ(id0, id2, "id2_check_val"))
 		goto cleanup;
 
 	/* updating program under active BPF link works as expected */
 	err = bpf_link__update_program(link, skel1->progs.xdp_handler);
-	if (CHECK(err, "link_upd", "failed: %d\n", err))
+	if (!ASSERT_OK(err, "link_upd"))
 		goto cleanup;
 
 	memset(&link_info, 0, sizeof(link_info));
 	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
-	if (CHECK(err, "link_info", "failed: %d\n", err))
+	if (!ASSERT_OK(err, "link_info"))
 		goto cleanup;
 
-	CHECK(link_info.type != BPF_LINK_TYPE_XDP, "link_type",
-	      "got %u != exp %u\n", link_info.type, BPF_LINK_TYPE_XDP);
-	CHECK(link_info.prog_id != id1, "link_prog_id",
-	      "got %u != exp %u\n", link_info.prog_id, id1);
-	CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex",
-	      "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO);
+	ASSERT_EQ(link_info.type, BPF_LINK_TYPE_XDP, "link_type");
+	ASSERT_EQ(link_info.prog_id, id1, "link_prog_id");
+	ASSERT_EQ(link_info.xdp.ifindex, IFINDEX_LO, "link_ifindex");
 
 	err = bpf_link__detach(link);
-	if (CHECK(err, "link_detach", "failed %d\n", err))
+	if (!ASSERT_OK(err, "link_detach"))
 		goto cleanup;
 
 	memset(&link_info, 0, sizeof(link_info));
 	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
-	if (CHECK(err, "link_info", "failed: %d\n", err))
-		goto cleanup;
-	CHECK(link_info.prog_id != id1, "link_prog_id",
-	      "got %u != exp %u\n", link_info.prog_id, id1);
+
+	ASSERT_OK(err, "link_info");
+	ASSERT_EQ(link_info.prog_id, id1, "link_prog_id");
 	/* ifindex should be zeroed out */
-	CHECK(link_info.xdp.ifindex != 0, "link_ifindex",
-	      "got %u != exp %u\n", link_info.xdp.ifindex, 0);
+	ASSERT_EQ(link_info.xdp.ifindex, 0, "link_ifindex");
 
 cleanup:
 	test_xdp_link__destroy(skel1);
-- 
cgit v1.2.3


From 036a05f50bd777134b1955f400e8d24c0149fef4 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 7 Jan 2022 23:11:15 +0100
Subject: bpf/selftests: Add check for updating XDP bpf_link with wrong program
 type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a check to the xdp_link selftest that the kernel rejects replacing an
XDP program with a different program type on link update.

v2:
- Split this out into its own patch.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20220107221115.326171-3-toke@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_link.c | 5 +++++
 tools/testing/selftests/bpf/progs/test_xdp_link.c | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
index eec0bf83546b..b2b357f8c74c 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
@@ -127,6 +127,11 @@ void serial_test_xdp_link(void)
 	ASSERT_EQ(link_info.prog_id, id1, "link_prog_id");
 	ASSERT_EQ(link_info.xdp.ifindex, IFINDEX_LO, "link_ifindex");
 
+	/* updating program under active BPF link with different type fails */
+	err = bpf_link__update_program(link, skel1->progs.tc_handler);
+	if (!ASSERT_ERR(err, "link_upd_invalid"))
+		goto cleanup;
+
 	err = bpf_link__detach(link);
 	if (!ASSERT_OK(err, "link_detach"))
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_link.c b/tools/testing/selftests/bpf/progs/test_xdp_link.c
index ee7d6ac0f615..64ff32eaae92 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_link.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_link.c
@@ -10,3 +10,9 @@ int xdp_handler(struct xdp_md *xdp)
 {
 	return 0;
 }
+
+SEC("tc")
+int tc_handler(struct __sk_buff *skb)
+{
+	return 0;
+}
-- 
cgit v1.2.3


From 0046686da0ef692a6381260c3aa44291187eafc9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 23 Dec 2021 10:39:47 -0800
Subject: perf test: Enable system wide for metricgroups test

Uncore events as group leaders fail in per-thread mode causing exit
errors. Enable system-wide for metricgroup testing. This fixes the HPC
metric group when tested on skylakex.

Fixes: 4a87dea9e60fe100 ("perf test: Workload test of metric and metricgroups")
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211223183948.3423989-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat_all_metricgroups.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh
index de24d374ce24..cb35e488809a 100755
--- a/tools/perf/tests/shell/stat_all_metricgroups.sh
+++ b/tools/perf/tests/shell/stat_all_metricgroups.sh
@@ -6,7 +6,7 @@ set -e
 
 for m in $(perf list --raw-dump metricgroups); do
   echo "Testing $m"
-  perf stat -M "$m" true
+  perf stat -M "$m" -a true
 done
 
 exit 0
-- 
cgit v1.2.3


From 62942e9fda9fd1def10ffcbd5e1c025b3c9eec17 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Wed, 12 Jan 2022 10:50:57 +0200
Subject: perf script: Fix hex dump character output

Using grep -C with perf script -D can give erroneous results as grep loses
lines due to non-printable characters, for example, below the 0020, 0060
and 0070 lines are missing:

 $ perf script -D | grep -C10 AUX | head
 .  0010:  08 00 00 00 00 00 00 00 1f 00 00 00 00 00 00 00  ................
 .  0030:  01 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00  ................
 .  0040:  00 08 00 00 00 00 00 00 02 00 00 00 00 00 00 00  ................
 .  0050:  00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
 .  0080:  02 00 00 00 00 00 00 00 1b 00 00 00 00 00 00 00  ................
 .  0090:  00 00 00 00 00 00 00 00                          ........

 0 0 0x450 [0x98]: PERF_RECORD_AUXTRACE_INFO type: 1
   PMU Type            8
   Time Shift          31

perf's isprint() is a custom implementation from the kernel, but the
kernel's _ctype appears to include characters from Latin-1 Supplement which
is not compatible with, for example, UTF-8. Fix by checking also isascii().

After:

 $ tools/perf/perf script -D | grep -C10 AUX | head
 .  0010:  08 00 00 00 00 00 00 00 1f 00 00 00 00 00 00 00  ................
 .  0020:  03 84 32 2f 00 00 00 00 63 7c 4f d2 fa ff ff ff  ..2/....c|O.....
 .  0030:  01 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00  ................
 .  0040:  00 08 00 00 00 00 00 00 02 00 00 00 00 00 00 00  ................
 .  0050:  00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
 .  0060:  00 02 00 00 00 00 00 00 00 c0 03 00 00 00 00 00  ................
 .  0070:  e2 00 00 00 00 00 00 00 02 00 00 00 00 00 00 00  ................
 .  0080:  02 00 00 00 00 00 00 00 1b 00 00 00 00 00 00 00  ................
 .  0090:  00 00 00 00 00 00 00 00                          ........

Fixes: 3052ba56bcb58904 ("tools perf: Move from sane_ctype.h obtained from git to the Linux's original")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lore.kernel.org/lkml/20220112085057.277205-1-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 2c06abf6dcd2..65e6c22f38e4 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -179,7 +179,7 @@ static int trace_event_printer(enum binary_printer_ops op,
 		break;
 	case BINARY_PRINT_CHAR_DATA:
 		printed += color_fprintf(fp, color, "%c",
-			      isprint(ch) ? ch : '.');
+			      isprint(ch) && isascii(ch) ? ch : '.');
 		break;
 	case BINARY_PRINT_CHAR_PAD:
 		printed += color_fprintf(fp, color, " ");
-- 
cgit v1.2.3


From dcffc5ebb80dd5887b91091b8ecd082c9ed75361 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 23 Dec 2021 10:39:48 -0800
Subject: perf evsel: Improve error message for uncore events

When a group has multiple events and the leader fails it can yield
errors like:

  $ perf stat -e '{uncore_imc/cas_count_read/},instructions' /bin/true
  Error:
  The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (uncore_imc/cas_count_read/).
  /bin/dmesg | grep -i perf may provide additional information.

However, when not the group leader <not supported> is given:

  $ perf stat -e '{instructions,uncore_imc/cas_count_read/}' /bin/true
  ...
           1,619,057      instructions
     <not supported> MiB  uncore_imc/cas_count_read/

This is necessary because get_group_fd will fail if the leader fails and
is the direct result of the check on line 750 of builtin-stat.c in
stat_handle_error that returns COUNTER_SKIP for the latter case.

This patch improves the error message to:

  $ perf stat -e '{uncore_imc/cas_count_read/},instructions' /bin/true
  Error:
  Invalid event (uncore_imc/cas_count_read/) in per-thread mode, enable system wide with '-a'.

v2. Changed the test to use !target__has_cpu as suggested by Namhyung Kim.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211223183948.3423989-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 656c30b988ce..a0acf53a2510 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2931,6 +2931,10 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target,
 			return scnprintf(msg, size, "wrong clockid (%d).", clockid);
 		if (perf_missing_features.aux_output)
 			return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel.");
+		if (!target__has_cpu(target))
+			return scnprintf(msg, size,
+	"Invalid event (%s) in per-thread mode, enable system wide with '-a'.",
+					evsel__name(evsel));
 		break;
 	case ENODATA:
 		return scnprintf(msg, size, "Cannot collect data source with the load latency event alone. "
-- 
cgit v1.2.3


From 818ab78c03aad94fabc18d386e9c73b539a1f447 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:04 -0800
Subject: libperf: Add comments to 'struct perf_cpu_map'

A particular observed problem is confusing the index with the CPU value,
documentation should hopefully reduce this type of problem.

Reviewed-by: James Clark <james.clark@arm.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/include/internal/cpumap.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/perf/include/internal/cpumap.h b/tools/lib/perf/include/internal/cpumap.h
index 840d4032587b..4054169c12c5 100644
--- a/tools/lib/perf/include/internal/cpumap.h
+++ b/tools/lib/perf/include/internal/cpumap.h
@@ -4,9 +4,18 @@
 
 #include <linux/refcount.h>
 
+/**
+ * A sized, reference counted, sorted array of integers representing CPU
+ * numbers. This is commonly used to capture which CPUs a PMU is associated
+ * with. The indices into the cpumap are frequently used as they avoid having
+ * gaps if CPU numbers were used. For events associated with a pid, rather than
+ * a CPU, a single dummy map with an entry of -1 is used.
+ */
 struct perf_cpu_map {
 	refcount_t	refcnt;
+	/** Length of the map array. */
 	int		nr;
+	/** The CPU values. */
 	int		map[];
 };
 
-- 
cgit v1.2.3


From ca2c9b76bc3c75ac116ef199b75e7ca4e27e7acb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:05 -0800
Subject: perf stat: Add aggr creators that are passed a cpu

The cpu_map and index can get confused. Add variants of the cpu_map__get
routines that are passed a cpu. Make the existing cpu_map__get routines
use the new functions with a view to remove them when no longer used.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-3-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c | 79 ++++++++++++++++++++++++++++--------------------
 tools/perf/util/cpumap.h |  6 +++-
 2 files changed, 51 insertions(+), 34 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 87d3eca9b872..49fba2c53822 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -128,21 +128,23 @@ int cpu_map__get_socket_id(int cpu)
 	return ret ?: value;
 }
 
-struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx,
-					void *data __maybe_unused)
+struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_unused)
 {
-	int cpu;
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
-	if (idx > map->nr)
-		return id;
-
-	cpu = map->map[idx];
-
 	id.socket = cpu_map__get_socket_id(cpu);
 	return id;
 }
 
+struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx,
+				       void *data)
+{
+	if (idx < 0 || idx > map->nr)
+		return cpu_map__empty_aggr_cpu_id();
+
+	return cpu_map__get_socket_aggr_by_cpu(map->map[idx], data);
+}
+
 static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer)
 {
 	struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
@@ -200,15 +202,10 @@ int cpu_map__get_die_id(int cpu)
 	return ret ?: value;
 }
 
-struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data)
+struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 {
-	int cpu, die;
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
-
-	if (idx > map->nr)
-		return id;
-
-	cpu = map->map[idx];
+	struct aggr_cpu_id id;
+	int die;
 
 	die = cpu_map__get_die_id(cpu);
 	/* There is no die_id on legacy system. */
@@ -220,7 +217,7 @@ struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *dat
 	 * with the socket ID and then add die to
 	 * make a unique ID.
 	 */
-	id = cpu_map__get_socket(map, idx, data);
+	id = cpu_map__get_socket_aggr_by_cpu(cpu, data);
 	if (cpu_map__aggr_cpu_id_is_empty(id))
 		return id;
 
@@ -228,6 +225,15 @@ struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *dat
 	return id;
 }
 
+struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx,
+				    void *data)
+{
+	if (idx < 0 || idx > map->nr)
+		return cpu_map__empty_aggr_cpu_id();
+
+	return cpu_map__get_die_aggr_by_cpu(map->map[idx], data);
+}
+
 int cpu_map__get_core_id(int cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu, "core_id", &value);
@@ -239,20 +245,13 @@ int cpu_map__get_node_id(int cpu)
 	return cpu__get_node(cpu);
 }
 
-struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data)
+struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 {
-	int cpu;
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
-
-	if (idx > map->nr)
-		return id;
-
-	cpu = map->map[idx];
-
-	cpu = cpu_map__get_core_id(cpu);
+	struct aggr_cpu_id id;
+	int core = cpu_map__get_core_id(cpu);
 
 	/* cpu_map__get_die returns a struct with socket and die set*/
-	id = cpu_map__get_die(map, idx, data);
+	id = cpu_map__get_die_aggr_by_cpu(cpu, data);
 	if (cpu_map__aggr_cpu_id_is_empty(id))
 		return id;
 
@@ -260,19 +259,33 @@ struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *da
 	 * core_id is relative to socket and die, we need a global id.
 	 * So we combine the result from cpu_map__get_die with the core id
 	 */
-	id.core = cpu;
+	id.core = core;
 	return id;
+
 }
 
-struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data __maybe_unused)
+struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data)
+{
+	if (idx < 0 || idx > map->nr)
+		return cpu_map__empty_aggr_cpu_id();
+
+	return cpu_map__get_core_aggr_by_cpu(map->map[idx], data);
+}
+
+struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
+	id.node = cpu_map__get_node_id(cpu);
+	return id;
+}
+
+struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data)
+{
 	if (idx < 0 || idx >= map->nr)
-		return id;
+		return cpu_map__empty_aggr_cpu_id();
 
-	id.node = cpu_map__get_node_id(map->map[idx]);
-	return id;
+	return cpu_map__get_node_aggr_by_cpu(map->map[idx], data);
 }
 
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp)
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index a27eeaf086e8..c62d67704425 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -31,13 +31,17 @@ size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp);
 int cpu_map__get_socket_id(int cpu);
+struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_die_id(int cpu);
+struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_core_id(int cpu);
+struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_node_id(int cpu);
-struct aggr_cpu_id  cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data);
+struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
+struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp);
 int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep);
 int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep);
-- 
cgit v1.2.3


From 01843ca0197783d0951a1948ebeaaed9a47ce55d Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:06 -0800
Subject: perf stat: Correct aggregation CPU map

Switch the perf_cpu_map in aggr_update_shadow from
the evlist to the counter's cpu map, so the index is appropriate. This
addresses a problem where uncore counts, with a cpumap like:
$ cat /sys/devices/uncore_imc_0/cpumask
0,18
Don't aggregate counts in CPUs based on the index of those values in the
cpumap (0 and 1) but on the actual CPU (0 and 18). Thereby correcting
metric calculations in per-socket mode for counters without a full
cpumask.

On a SkylakeX with a tweaked DRAM_BW_Use metric, to remove unnecessary
scaling, this gives:

Before:
$ /perf stat --per-socket -M DRAM_BW_Use -I 1000
     1.001102293 S0        1              27.01 MiB  uncore_imc/cas_count_write/ #   103.00 DRAM_BW_Use
     1.001102293 S0        1              30.22 MiB  uncore_imc/cas_count_read/
     1.001102293 S0        1      1,001,102,293 ns   duration_time
     1.001102293 S1        1              20.10 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
     1.001102293 S1        1              32.74 MiB  uncore_imc/cas_count_read/
     1.001102293 S1        0      <not counted> ns   duration_time
     2.003517973 S0        1              83.04 MiB  uncore_imc/cas_count_write/ #   920.00 DRAM_BW_Use
     2.003517973 S0        1             145.95 MiB  uncore_imc/cas_count_read/
     2.003517973 S0        1      1,002,415,680 ns   duration_time
     2.003517973 S1        1             302.45 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
     2.003517973 S1        1             290.99 MiB  uncore_imc/cas_count_read/
     2.003517973 S1        0      <not counted> ns   duration_time

After:
$ perf stat --per-socket -M DRAM_BW_Use -I 1000
     1.001080840 S0        1              24.96 MiB  uncore_imc/cas_count_write/ #    54.00 DRAM_BW_Use
     1.001080840 S0        1              33.64 MiB  uncore_imc/cas_count_read/
     1.001080840 S0        1      1,001,080,840 ns   duration_time
     1.001080840 S1        1              42.43 MiB  uncore_imc/cas_count_write/ #    84.00 DRAM_BW_Use
     1.001080840 S1        1              47.05 MiB  uncore_imc/cas_count_read/
     1.001080840 S1        0      <not counted> ns   duration_time

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: John Garry <john.garry@huawei.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-4-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 588601000f3f..b0fa81ffce61 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -526,7 +526,7 @@ static void aggr_update_shadow(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			val = 0;
 			for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
-				s2 = config->aggr_get_id(config, evlist->core.cpus, cpu);
+				s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
 				if (!cpu_map__compare_aggr_cpu_id(s2, id))
 					continue;
 				val += perf_counts(counter->counts, cpu, 0)->val;
-- 
cgit v1.2.3


From a023283fadef8a3f6916ba2b0c37955d76ffaf4d Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:07 -0800
Subject: perf stat: Switch aggregation to use for_each loop

Tidy up the use of cpu and index to hopefully make the code less error
prone. Avoid unused warnings with (void) which will be removed in a
later patch.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-5-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 48 ++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index b0fa81ffce61..efab39a759ff 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -330,8 +330,8 @@ static void print_metric_header(struct perf_stat_config *config,
 static int first_shadow_cpu(struct perf_stat_config *config,
 			    struct evsel *evsel, struct aggr_cpu_id id)
 {
-	struct evlist *evlist = evsel->evlist;
-	int i;
+	struct perf_cpu_map *cpus;
+	int cpu, idx;
 
 	if (config->aggr_mode == AGGR_NONE)
 		return id.core;
@@ -339,14 +339,11 @@ static int first_shadow_cpu(struct perf_stat_config *config,
 	if (!config->aggr_get_id)
 		return 0;
 
-	for (i = 0; i < evsel__nr_cpus(evsel); i++) {
-		int cpu2 = evsel__cpus(evsel)->map[i];
-
-		if (cpu_map__compare_aggr_cpu_id(
-					config->aggr_get_id(config, evlist->core.cpus, cpu2),
-					id)) {
-			return cpu2;
-		}
+	cpus = evsel__cpus(evsel);
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+		if (cpu_map__compare_aggr_cpu_id(config->aggr_get_id(config, cpus, idx),
+						 id))
+			return cpu;
 	}
 	return 0;
 }
@@ -516,20 +513,23 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 static void aggr_update_shadow(struct perf_stat_config *config,
 			       struct evlist *evlist)
 {
-	int cpu, s;
+	int cpu, idx, s;
 	struct aggr_cpu_id s2, id;
 	u64 val;
 	struct evsel *counter;
+	struct perf_cpu_map *cpus;
 
 	for (s = 0; s < config->aggr_map->nr; s++) {
 		id = config->aggr_map->map[s];
 		evlist__for_each_entry(evlist, counter) {
+			cpus = evsel__cpus(counter);
 			val = 0;
-			for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
-				s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
+			perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+				(void)cpu;
+				s2 = config->aggr_get_id(config, cpus, idx);
 				if (!cpu_map__compare_aggr_cpu_id(s2, id))
 					continue;
-				val += perf_counts(counter->counts, cpu, 0)->val;
+				val += perf_counts(counter->counts, idx, 0)->val;
 			}
 			perf_stat__update_shadow_stats(counter, val,
 					first_shadow_cpu(config, counter, id),
@@ -634,18 +634,21 @@ static void aggr_cb(struct perf_stat_config *config,
 		    struct evsel *counter, void *data, bool first)
 {
 	struct aggr_data *ad = data;
-	int cpu;
+	int idx, cpu;
+	struct perf_cpu_map *cpus;
 	struct aggr_cpu_id s2;
 
-	for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
+	cpus = evsel__cpus(counter);
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		struct perf_counts_values *counts;
 
-		s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
+		(void)cpu;
+		s2 = config->aggr_get_id(config, cpus, idx);
 		if (!cpu_map__compare_aggr_cpu_id(s2, ad->id))
 			continue;
 		if (first)
 			ad->nr++;
-		counts = perf_counts(counter->counts, cpu, 0);
+		counts = perf_counts(counter->counts, idx, 0);
 		/*
 		 * When any result is bad, make them all to give
 		 * consistent output in interval mode.
@@ -1208,10 +1211,13 @@ static void print_percore_thread(struct perf_stat_config *config,
 {
 	int s;
 	struct aggr_cpu_id s2, id;
+	struct perf_cpu_map *cpus;
 	bool first = true;
+	int idx, cpu;
 
-	for (int i = 0; i < evsel__nr_cpus(counter); i++) {
-		s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
+	cpus = evsel__cpus(counter);
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+		s2 = config->aggr_get_id(config, cpus, idx);
 		for (s = 0; s < config->aggr_map->nr; s++) {
 			id = config->aggr_map->map[s];
 			if (cpu_map__compare_aggr_cpu_id(s2, id))
@@ -1220,7 +1226,7 @@ static void print_percore_thread(struct perf_stat_config *config,
 
 		print_counter_aggrdata(config, counter, s,
 				       prefix, false,
-				       &first, i);
+				       &first, cpu);
 	}
 }
 
-- 
cgit v1.2.3


From 88031a0de7d68d132014154b9e5307428e8ed70d Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:08 -0800
Subject: perf stat: Switch to cpu version of cpu_map__get()

Avoid possible bugs where the wrong index is passed with the cpu_map.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-6-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c      | 93 +++++++++++++++++++++++-------------------
 tools/perf/util/stat-display.c | 11 ++---
 tools/perf/util/stat.h         |  3 +-
 3 files changed, 57 insertions(+), 50 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f6ca2b054c5b..9791ae9b1a53 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1323,69 +1323,63 @@ static struct option stat_options[] = {
 };
 
 static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config *config __maybe_unused,
-				 struct perf_cpu_map *map, int cpu)
+						int cpu)
 {
-	return cpu_map__get_socket(map, cpu, NULL);
+	return cpu_map__get_socket_aggr_by_cpu(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
-			      struct perf_cpu_map *map, int cpu)
+					     int cpu)
 {
-	return cpu_map__get_die(map, cpu, NULL);
+	return cpu_map__get_die_aggr_by_cpu(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
-			       struct perf_cpu_map *map, int cpu)
+					      int cpu)
 {
-	return cpu_map__get_core(map, cpu, NULL);
+	return cpu_map__get_core_aggr_by_cpu(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
-			       struct perf_cpu_map *map, int cpu)
+					      int cpu)
 {
-	return cpu_map__get_node(map, cpu, NULL);
+	return cpu_map__get_node_aggr_by_cpu(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
-			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
+					      aggr_get_id_t get_id, int cpu)
 {
-	int cpu;
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
-	if (idx >= map->nr)
-		return id;
-
-	cpu = map->map[idx];
-
 	if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu]))
-		config->cpus_aggr_map->map[cpu] = get_id(config, map, idx);
+		config->cpus_aggr_map->map[cpu] = get_id(config, cpu);
 
 	id = config->cpus_aggr_map->map[cpu];
 	return id;
 }
 
 static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config *config,
-					struct perf_cpu_map *map, int idx)
+						       int cpu)
 {
-	return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_socket, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *config,
-					struct perf_cpu_map *map, int idx)
+						    int cpu)
 {
-	return perf_stat__get_aggr(config, perf_stat__get_die, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_die, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config,
-				      struct perf_cpu_map *map, int idx)
+						     int cpu)
 {
-	return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_core, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config *config,
-				      struct perf_cpu_map *map, int idx)
+						     int cpu)
 {
-	return perf_stat__get_aggr(config, perf_stat__get_node, map, idx);
+	return perf_stat__get_aggr(config, perf_stat__get_node, cpu);
 }
 
 static bool term_percore_set(void)
@@ -1483,8 +1477,9 @@ static void perf_stat__exit_aggr_mode(void)
 	stat_config.cpus_aggr_map = NULL;
 }
 
-static inline int perf_env__get_cpu(struct perf_env *env, struct perf_cpu_map *map, int idx)
+static inline int perf_env__get_cpu(void *data, struct perf_cpu_map *map, int idx)
 {
+	struct perf_env *env = data;
 	int cpu;
 
 	if (idx > map->nr)
@@ -1498,10 +1493,9 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct perf_cpu_map *m
 	return cpu;
 }
 
-static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
-	int cpu = perf_env__get_cpu(env, map, idx);
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
 	if (cpu != -1)
@@ -1510,11 +1504,15 @@ static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
+{
+	return perf_env__get_socket_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
+}
+
+static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
-	int cpu = perf_env__get_cpu(env, map, idx);
 
 	if (cpu != -1) {
 		/*
@@ -1529,11 +1527,15 @@ static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, v
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
+{
+	return perf_env__get_die_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
+}
+
+static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
-	int cpu = perf_env__get_cpu(env, map, idx);
 
 	if (cpu != -1) {
 		/*
@@ -1549,15 +1551,24 @@ static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx,
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, void *data)
+static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, void *data)
+{
+	return perf_env__get_core_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
+}
+
+static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 {
-	int cpu = perf_env__get_cpu(data, map, idx);
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
 
 	id.node = perf_env__numa_node(data, cpu);
 	return id;
 }
 
+static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, void *data)
+{
+	return perf_env__get_node_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
+}
+
 static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				      struct cpu_aggr_map **sockp)
 {
@@ -1583,26 +1594,26 @@ static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map *c
 }
 
 static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
-				      struct perf_cpu_map *map, int idx)
+						     int cpu)
 {
-	return perf_env__get_socket(map, idx, &perf_stat.session->header.env);
+	return perf_env__get_socket_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *config __maybe_unused,
-				   struct perf_cpu_map *map, int idx)
+						  int cpu)
 {
-	return perf_env__get_die(map, idx, &perf_stat.session->header.env);
+	return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
 static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused,
-				    struct perf_cpu_map *map, int idx)
+						   int cpu)
 {
-	return perf_env__get_core(map, idx, &perf_stat.session->header.env);
+	return perf_env__get_core_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
 static struct aggr_cpu_id perf_stat__get_node_file(struct perf_stat_config *config __maybe_unused,
-				    struct perf_cpu_map *map, int idx)
+						   int cpu)
 {
-	return perf_env__get_node(map, idx, &perf_stat.session->header.env);
+	return perf_env__get_node_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
 static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index efab39a759ff..6c40b91d5e32 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -341,8 +341,7 @@ static int first_shadow_cpu(struct perf_stat_config *config,
 
 	cpus = evsel__cpus(evsel);
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		if (cpu_map__compare_aggr_cpu_id(config->aggr_get_id(config, cpus, idx),
-						 id))
+		if (cpu_map__compare_aggr_cpu_id(config->aggr_get_id(config, cpu), id))
 			return cpu;
 	}
 	return 0;
@@ -525,8 +524,7 @@ static void aggr_update_shadow(struct perf_stat_config *config,
 			cpus = evsel__cpus(counter);
 			val = 0;
 			perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-				(void)cpu;
-				s2 = config->aggr_get_id(config, cpus, idx);
+				s2 = config->aggr_get_id(config, cpu);
 				if (!cpu_map__compare_aggr_cpu_id(s2, id))
 					continue;
 				val += perf_counts(counter->counts, idx, 0)->val;
@@ -642,8 +640,7 @@ static void aggr_cb(struct perf_stat_config *config,
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		struct perf_counts_values *counts;
 
-		(void)cpu;
-		s2 = config->aggr_get_id(config, cpus, idx);
+		s2 = config->aggr_get_id(config, cpu);
 		if (!cpu_map__compare_aggr_cpu_id(s2, ad->id))
 			continue;
 		if (first)
@@ -1217,7 +1214,7 @@ static void print_percore_thread(struct perf_stat_config *config,
 
 	cpus = evsel__cpus(counter);
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		s2 = config->aggr_get_id(config, cpus, idx);
+		s2 = config->aggr_get_id(config, cpu);
 		for (s = 0; s < config->aggr_map->nr; s++) {
 			id = config->aggr_map->map[s];
 			if (cpu_map__compare_aggr_cpu_id(s2, id))
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 32c8527de347..32cf24186229 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -108,8 +108,7 @@ struct runtime_stat {
 	struct rblist value_list;
 };
 
-typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config,
-			     struct perf_cpu_map *m, int cpu);
+typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config, int cpu);
 
 struct perf_stat_config {
 	enum aggr_mode		 aggr_mode;
-- 
cgit v1.2.3


From eff54c24bb147afc0a1423b49bfa1b8eaa85a88f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:09 -0800
Subject: perf cpumap: Switch cpu_map__build_map() to cpu function

Avoid error prone cpu_map + idx variant. Remove now unused functions.

Committer notes:

Remove by now unused perf_env__get_cpu().

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-7-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 44 ++++----------------------------------------
 tools/perf/util/cpumap.c  | 12 ++++++------
 tools/perf/util/cpumap.h  |  2 +-
 3 files changed, 11 insertions(+), 47 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 9791ae9b1a53..40cb3518f27e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1477,22 +1477,6 @@ static void perf_stat__exit_aggr_mode(void)
 	stat_config.cpus_aggr_map = NULL;
 }
 
-static inline int perf_env__get_cpu(void *data, struct perf_cpu_map *map, int idx)
-{
-	struct perf_env *env = data;
-	int cpu;
-
-	if (idx > map->nr)
-		return -1;
-
-	cpu = map->map[idx];
-
-	if (cpu >= env->nr_cpus_avail)
-		return -1;
-
-	return cpu;
-}
-
 static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
@@ -1504,11 +1488,6 @@ static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data)
-{
-	return perf_env__get_socket_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
-}
-
 static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
@@ -1527,11 +1506,6 @@ static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, void *data)
-{
-	return perf_env__get_die_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
-}
-
 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
@@ -1551,11 +1525,6 @@ static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, void *data)
-{
-	return perf_env__get_core_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
-}
-
 static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 {
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
@@ -1564,33 +1533,28 @@ static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, void *data)
-{
-	return perf_env__get_node_aggr_by_cpu(perf_env__get_cpu(data, map, idx), data);
-}
-
 static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				      struct cpu_aggr_map **sockp)
 {
-	return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env);
+	return cpu_map__build_map(cpus, sockp, perf_env__get_socket_aggr_by_cpu, env);
 }
 
 static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				   struct cpu_aggr_map **diep)
 {
-	return cpu_map__build_map(cpus, diep, perf_env__get_die, env);
+	return cpu_map__build_map(cpus, diep, perf_env__get_die_aggr_by_cpu, env);
 }
 
 static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				    struct cpu_aggr_map **corep)
 {
-	return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
+	return cpu_map__build_map(cpus, corep, perf_env__get_core_aggr_by_cpu, env);
 }
 
 static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				    struct cpu_aggr_map **nodep)
 {
-	return cpu_map__build_map(cpus, nodep, perf_env__get_node, env);
+	return cpu_map__build_map(cpus, nodep, perf_env__get_node_aggr_by_cpu, env);
 }
 
 static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 49fba2c53822..feaf34b25efc 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -163,7 +163,7 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer)
 }
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
-		       struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data),
+		       struct aggr_cpu_id (*f)(int cpu, void *data),
 		       void *data)
 {
 	int nr = cpus->nr;
@@ -178,7 +178,7 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 	c->nr = 0;
 
 	for (cpu = 0; cpu < nr; cpu++) {
-		s1 = f(cpus, cpu, data);
+		s1 = f(cpu, data);
 		for (s2 = 0; s2 < c->nr; s2++) {
 			if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2]))
 				break;
@@ -290,22 +290,22 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *da
 
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp)
 {
-	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL);
+	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket_aggr_by_cpu, NULL);
 }
 
 int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep)
 {
-	return cpu_map__build_map(cpus, diep, cpu_map__get_die, NULL);
+	return cpu_map__build_map(cpus, diep, cpu_map__get_die_aggr_by_cpu, NULL);
 }
 
 int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep)
 {
-	return cpu_map__build_map(cpus, corep, cpu_map__get_core, NULL);
+	return cpu_map__build_map(cpus, corep, cpu_map__get_core_aggr_by_cpu, NULL);
 }
 
 int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **numap)
 {
-	return cpu_map__build_map(cpus, numap, cpu_map__get_node, NULL);
+	return cpu_map__build_map(cpus, numap, cpu_map__get_node_aggr_by_cpu, NULL);
 }
 
 /* setup simple routines to easily access node numbers given a cpu number */
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index c62d67704425..9648816c4255 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -63,7 +63,7 @@ int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
-		       struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data),
+		       struct aggr_cpu_id (*f)(int cpu, void *data),
 		       void *data);
 
 int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
-- 
cgit v1.2.3


From 448a69d9f34d02920cffba741ca0a2e34a5bb316 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:10 -0800
Subject: perf cpumap: Remove map+index get_socket()

Migrate final users to appropriate cpu variant.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-8-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/topology.c | 2 +-
 tools/perf/util/cpumap.c    | 9 ---------
 tools/perf/util/cpumap.h    | 1 -
 tools/perf/util/stat.c      | 2 +-
 4 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 869986139146..69a64074b897 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -150,7 +150,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that socket ID contains only socket
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_socket(map, i, NULL);
+		id = cpu_map__get_socket_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
 			session->header.env.cpu[map->map[i]].socket_id == id.socket);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index feaf34b25efc..342a5eaee9d3 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -136,15 +136,6 @@ struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_u
 	return id;
 }
 
-struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx,
-				       void *data)
-{
-	if (idx < 0 || idx > map->nr)
-		return cpu_map__empty_aggr_cpu_id();
-
-	return cpu_map__get_socket_aggr_by_cpu(map->map[idx], data);
-}
-
 static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer)
 {
 	struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 9648816c4255..a53af24301d2 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -32,7 +32,6 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp);
 int cpu_map__get_socket_id(int cpu);
 struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_die_id(int cpu);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 09ea334586f2..9eca1111fa52 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -328,7 +328,7 @@ static int check_per_pkg(struct evsel *counter,
 	if (!(vals->run && vals->ena))
 		return 0;
 
-	s = cpu_map__get_socket(cpus, cpu, NULL).socket;
+	s = cpu_map__get_socket_id(cpu);
 	if (s < 0)
 		return -1;
 
-- 
cgit v1.2.3


From 1cdae3d6734779a637bc4e6ec24e7f615b4e71be Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:11 -0800
Subject: perf cpumap: Remove map+index get_die()

Migrate final users to appropriate cpu variant.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-9-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/topology.c | 2 +-
 tools/perf/util/cpumap.c    | 9 ---------
 tools/perf/util/cpumap.h    | 1 -
 tools/perf/util/stat.c      | 2 +-
 4 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 69a64074b897..ce085b6f379b 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -136,7 +136,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that die ID contains socket and die
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_die(map, i, NULL);
+		id = cpu_map__get_die_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
 			session->header.env.cpu[map->map[i]].socket_id == id.socket);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 342a5eaee9d3..ff91c32da688 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -216,15 +216,6 @@ struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx,
-				    void *data)
-{
-	if (idx < 0 || idx > map->nr)
-		return cpu_map__empty_aggr_cpu_id();
-
-	return cpu_map__get_die_aggr_by_cpu(map->map[idx], data);
-}
-
 int cpu_map__get_core_id(int cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu, "core_id", &value);
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index a53af24301d2..365ed69699e1 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -34,7 +34,6 @@ int cpu_map__get_socket_id(int cpu);
 struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
 int cpu_map__get_die_id(int cpu);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_core_id(int cpu);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 9eca1111fa52..5ed99bcfe91e 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -336,7 +336,7 @@ static int check_per_pkg(struct evsel *counter,
 	 * On multi-die system, die_id > 0. On no-die system, die_id = 0.
 	 * We use hashmap(socket, die) to check the used socket+die pair.
 	 */
-	d = cpu_map__get_die(cpus, cpu, NULL).die;
+	d = cpu_map__get_die_id(cpu);
 	if (d < 0)
 		return -1;
 
-- 
cgit v1.2.3


From 3f6233dc7798044637426ae1099d88aa375c467f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:12 -0800
Subject: perf cpumap: Remove map+index get_core()

Migrate final users to appropriate cpu variant.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-10-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/topology.c | 2 +-
 tools/perf/util/cpumap.c    | 8 --------
 tools/perf/util/cpumap.h    | 1 -
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index ce085b6f379b..9a671670415a 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -121,7 +121,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that core ID contains socket, die and core
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_core(map, i, NULL);
+		id = cpu_map__get_core_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Core map - Core ID doesn't match",
 			session->header.env.cpu[map->map[i]].core_id == id.core);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index ff91c32da688..e8149bcf8bfa 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -246,14 +246,6 @@ struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 
 }
 
-struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data)
-{
-	if (idx < 0 || idx > map->nr)
-		return cpu_map__empty_aggr_cpu_id();
-
-	return cpu_map__get_core_aggr_by_cpu(map->map[idx], data);
-}
-
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 365ed69699e1..7e1829468bd6 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -36,7 +36,6 @@ int cpu_map__get_die_id(int cpu);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
 int cpu_map__get_core_id(int cpu);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_node_id(int cpu);
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data);
-- 
cgit v1.2.3


From 86d94048e234c94af88a528ab4d5ef16e8a89f8a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:13 -0800
Subject: perf cpumap: Remove map+index get_node()

Migrate final users to appropriate cpu variant.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-11-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/topology.c | 2 +-
 tools/perf/util/cpumap.c    | 8 --------
 tools/perf/util/cpumap.h    | 1 -
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 9a671670415a..5992b323c4f5 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -162,7 +162,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that node ID contains only node
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_node(map, i, NULL);
+		id = cpu_map__get_node_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Node map - Node ID doesn't match",
 			cpu__get_node(map->map[i]) == id.node);
 		TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e8149bcf8bfa..f67b2e7aac13 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -254,14 +254,6 @@ struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unu
 	return id;
 }
 
-struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data)
-{
-	if (idx < 0 || idx >= map->nr)
-		return cpu_map__empty_aggr_cpu_id();
-
-	return cpu_map__get_node_aggr_by_cpu(map->map[idx], data);
-}
-
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp)
 {
 	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket_aggr_by_cpu, NULL);
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 7e1829468bd6..f0121dd4fdcb 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -38,7 +38,6 @@ int cpu_map__get_core_id(int cpu);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
 int cpu_map__get_node_id(int cpu);
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp);
 int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep);
 int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep);
-- 
cgit v1.2.3


From 49679da388f4c45b0ca444dcf8bb5f59a02f8f4e Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:14 -0800
Subject: perf cpumap: Add comments to aggr_cpu_id()

This code is already tested in topology.c.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-12-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index f0121dd4fdcb..edd93e1db36a 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -7,11 +7,20 @@
 #include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
+/** Identify where counts are aggregated, -1 implies not to aggregate. */
 struct aggr_cpu_id {
+	/** A value in the range 0 to number of threads. */
 	int thread;
+	/** The numa node X as read from /sys/devices/system/node/nodeX. */
 	int node;
+	/**
+	 * The socket number as read from
+	 * /sys/devices/system/cpu/cpuX/topology/physical_package_id.
+	 */
 	int socket;
+	/** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */
 	int die;
+	/** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */
 	int core;
 };
 
-- 
cgit v1.2.3


From 63e0fa873d8820b996a01a83d832bf1b3969e9b6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:15 -0800
Subject: perf cpumap: Remove unused cpu_map__socket()

Unused function so remove.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-13-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index edd93e1db36a..22e53fd54657 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -53,13 +53,6 @@ int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **cor
 int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **nodep);
 const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
 
-static inline int cpu_map__socket(struct perf_cpu_map *sock, int s)
-{
-	if (!sock || s > sock->nr || s < 0)
-		return 0;
-	return sock->map[s];
-}
-
 int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
-- 
cgit v1.2.3


From 3ac23d199c2bc3bc2a2b31c803e7c5d841959670 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:16 -0800
Subject: perf cpumap: Simplify equal function name

Rename cpu_map__compare_aggr_cpu_id() to aggr_cpu_id__equal(), the
cpu_map part of the name is misleading. Equal better describes the
function than compare.

Switch to const pointer rather than value as struct given the number of
variables in aggr_cpu_id().

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-14-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c       | 14 +++++++-------
 tools/perf/util/cpumap.h       |  2 +-
 tools/perf/util/stat-display.c | 18 ++++++++++--------
 3 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index f67b2e7aac13..8fa00a6221c8 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -171,7 +171,7 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 	for (cpu = 0; cpu < nr; cpu++) {
 		s1 = f(cpu, data);
 		for (s2 = 0; s2 < c->nr; s2++) {
-			if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2]))
+			if (aggr_cpu_id__equal(&s1, &c->map[s2]))
 				break;
 		}
 		if (s2 == c->nr) {
@@ -593,13 +593,13 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */
 	return online;
 }
 
-bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b)
+bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b)
 {
-	return a.thread == b.thread &&
-		a.node == b.node &&
-		a.socket == b.socket &&
-		a.die == b.die &&
-		a.core == b.core;
+	return a->thread == b->thread &&
+		a->node == b->node &&
+		a->socket == b->socket &&
+		a->die == b->die &&
+		a->core == b->core;
 }
 
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 22e53fd54657..652b76c69376 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -67,7 +67,7 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
-bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b);
+bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
 bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a);
 struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void);
 
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 6c40b91d5e32..0241436bb1fb 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -328,20 +328,22 @@ static void print_metric_header(struct perf_stat_config *config,
 }
 
 static int first_shadow_cpu(struct perf_stat_config *config,
-			    struct evsel *evsel, struct aggr_cpu_id id)
+			    struct evsel *evsel, const struct aggr_cpu_id *id)
 {
 	struct perf_cpu_map *cpus;
 	int cpu, idx;
 
 	if (config->aggr_mode == AGGR_NONE)
-		return id.core;
+		return id->core;
 
 	if (!config->aggr_get_id)
 		return 0;
 
 	cpus = evsel__cpus(evsel);
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		if (cpu_map__compare_aggr_cpu_id(config->aggr_get_id(config, cpu), id))
+		struct aggr_cpu_id cpu_id = config->aggr_get_id(config, cpu);
+
+		if (aggr_cpu_id__equal(&cpu_id, id))
 			return cpu;
 	}
 	return 0;
@@ -501,7 +503,7 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 	}
 
 	perf_stat__print_shadow_stats(config, counter, uval,
-				first_shadow_cpu(config, counter, id),
+				first_shadow_cpu(config, counter, &id),
 				&out, &config->metric_events, st);
 	if (!config->csv_output && !config->metric_only) {
 		print_noise(config, counter, noise);
@@ -525,12 +527,12 @@ static void aggr_update_shadow(struct perf_stat_config *config,
 			val = 0;
 			perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 				s2 = config->aggr_get_id(config, cpu);
-				if (!cpu_map__compare_aggr_cpu_id(s2, id))
+				if (!aggr_cpu_id__equal(&s2, &id))
 					continue;
 				val += perf_counts(counter->counts, idx, 0)->val;
 			}
 			perf_stat__update_shadow_stats(counter, val,
-					first_shadow_cpu(config, counter, id),
+					first_shadow_cpu(config, counter, &id),
 					&rt_stat);
 		}
 	}
@@ -641,7 +643,7 @@ static void aggr_cb(struct perf_stat_config *config,
 		struct perf_counts_values *counts;
 
 		s2 = config->aggr_get_id(config, cpu);
-		if (!cpu_map__compare_aggr_cpu_id(s2, ad->id))
+		if (!aggr_cpu_id__equal(&s2, &ad->id))
 			continue;
 		if (first)
 			ad->nr++;
@@ -1217,7 +1219,7 @@ static void print_percore_thread(struct perf_stat_config *config,
 		s2 = config->aggr_get_id(config, cpu);
 		for (s = 0; s < config->aggr_map->nr; s++) {
 			id = config->aggr_map->map[s];
-			if (cpu_map__compare_aggr_cpu_id(s2, id))
+			if (aggr_cpu_id__equal(&s2, &id))
 				break;
 		}
 
-- 
cgit v1.2.3


From 51b826fadf4fc42c8614b752b6cb0cb516589ade Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:17 -0800
Subject: perf cpumap: Rename empty functions

Remove cpu_map from name as a cpu_map isn't used. Pass a const pointer
rather than by value to avoid unnecessary copying.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-15-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c      | 12 ++++++------
 tools/perf/util/cpumap.c       | 24 ++++++++++++------------
 tools/perf/util/cpumap.h       |  4 ++--
 tools/perf/util/stat-display.c | 10 +++++-----
 4 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 40cb3518f27e..d06921cd3592 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1349,9 +1349,9 @@ static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __
 static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
 					      aggr_get_id_t get_id, int cpu)
 {
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu]))
+	if (aggr_cpu_id__is_empty(&config->cpus_aggr_map->map[cpu]))
 		config->cpus_aggr_map->map[cpu] = get_id(config, cpu);
 
 	id = config->cpus_aggr_map->map[cpu];
@@ -1480,7 +1480,7 @@ static void perf_stat__exit_aggr_mode(void)
 static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	if (cpu != -1)
 		id.socket = env->cpu[cpu].socket_id;
@@ -1491,7 +1491,7 @@ static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
 static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	if (cpu != -1) {
 		/*
@@ -1509,7 +1509,7 @@ static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
 {
 	struct perf_env *env = data;
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	if (cpu != -1) {
 		/*
@@ -1527,7 +1527,7 @@ static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
 
 static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 {
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	id.node = perf_env__numa_node(data, cpu);
 	return id;
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 8fa00a6221c8..b3e1304aca0c 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -104,7 +104,7 @@ struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr)
 
 		cpus->nr = nr;
 		for (i = 0; i < nr; i++)
-			cpus->map[i] = cpu_map__empty_aggr_cpu_id();
+			cpus->map[i] = aggr_cpu_id__empty();
 
 		refcount_set(&cpus->refcnt, 1);
 	}
@@ -130,7 +130,7 @@ int cpu_map__get_socket_id(int cpu)
 
 struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_unused)
 {
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	id.socket = cpu_map__get_socket_id(cpu);
 	return id;
@@ -209,7 +209,7 @@ struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 	 * make a unique ID.
 	 */
 	id = cpu_map__get_socket_aggr_by_cpu(cpu, data);
-	if (cpu_map__aggr_cpu_id_is_empty(id))
+	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
 	id.die = die;
@@ -234,7 +234,7 @@ struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 
 	/* cpu_map__get_die returns a struct with socket and die set*/
 	id = cpu_map__get_die_aggr_by_cpu(cpu, data);
-	if (cpu_map__aggr_cpu_id_is_empty(id))
+	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
 	/*
@@ -248,7 +248,7 @@ struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unused)
 {
-	struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id();
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
 	id.node = cpu_map__get_node_id(cpu);
 	return id;
@@ -602,16 +602,16 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 		a->core == b->core;
 }
 
-bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a)
+bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
 {
-	return a.thread == -1 &&
-		a.node == -1 &&
-		a.socket == -1 &&
-		a.die == -1 &&
-		a.core == -1;
+	return a->thread == -1 &&
+		a->node == -1 &&
+		a->socket == -1 &&
+		a->die == -1 &&
+		a->core == -1;
 }
 
-struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void)
+struct aggr_cpu_id aggr_cpu_id__empty(void)
 {
 	struct aggr_cpu_id ret = {
 		.thread = -1,
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 652b76c69376..9589b0001a28 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -68,7 +68,7 @@ int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
 bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
-bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a);
-struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void);
+bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a);
+struct aggr_cpu_id aggr_cpu_id__empty(void);
 
 #endif /* __PERF_CPUMAP_H */
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 0241436bb1fb..870b1db71fbc 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -698,7 +698,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 
 	uval = val * counter->scale;
 	if (cpu != -1) {
-		id = cpu_map__empty_aggr_cpu_id();
+		id = aggr_cpu_id__empty();
 		id.core = cpu;
 	}
 	printout(config, id, nr, counter, uval,
@@ -780,7 +780,7 @@ static struct perf_aggr_thread_value *sort_aggr_thread(
 			continue;
 
 		buf[i].counter = counter;
-		buf[i].id = cpu_map__empty_aggr_cpu_id();
+		buf[i].id = aggr_cpu_id__empty();
 		buf[i].id.thread = thread;
 		buf[i].uval = uval;
 		buf[i].val = val;
@@ -868,7 +868,7 @@ static void print_counter_aggr(struct perf_stat_config *config,
 		fprintf(output, "%s", prefix);
 
 	uval = cd.avg * counter->scale;
-	printout(config, cpu_map__empty_aggr_cpu_id(), 0, counter, uval, prefix, cd.avg_running,
+	printout(config, aggr_cpu_id__empty(), 0, counter, uval, prefix, cd.avg_running,
 		 cd.avg_enabled, cd.avg, &rt_stat);
 	if (!metric_only)
 		fprintf(output, "\n");
@@ -911,7 +911,7 @@ static void print_counter(struct perf_stat_config *config,
 			fprintf(output, "%s", prefix);
 
 		uval = val * counter->scale;
-		id = cpu_map__empty_aggr_cpu_id();
+		id = aggr_cpu_id__empty();
 		id.core = cpu;
 		printout(config, id, 0, counter, uval, prefix,
 			 run, ena, 1.0, &rt_stat);
@@ -938,7 +938,7 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 		if (prefix)
 			fputs(prefix, config->output);
 		evlist__for_each_entry(evlist, counter) {
-			id = cpu_map__empty_aggr_cpu_id();
+			id = aggr_cpu_id__empty();
 			id.core = cpu;
 			if (first) {
 				aggr_printout(config, counter, id, 0);
-- 
cgit v1.2.3


From 194a3a202564153493789997643181737a6ae4b9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:18 -0800
Subject: perf cpumap: Document cpu__get_node() and remove redundant function

cpu_map__get_node_id() isn't used externally and merely delegates to
cpu__get_node().

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-16-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c | 11 +++++------
 tools/perf/util/cpumap.h |  5 ++++-
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index b3e1304aca0c..1626b0991408 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -16,6 +16,10 @@
 static int max_cpu_num;
 static int max_present_cpu_num;
 static int max_node_num;
+/**
+ * The numa node X as read from /sys/devices/system/node/nodeX indexed by the
+ * CPU number.
+ */
 static int *cpunode_map;
 
 static struct perf_cpu_map *cpu_map__from_entries(struct cpu_map_entries *cpus)
@@ -222,11 +226,6 @@ int cpu_map__get_core_id(int cpu)
 	return ret ?: value;
 }
 
-int cpu_map__get_node_id(int cpu)
-{
-	return cpu__get_node(cpu);
-}
-
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 {
 	struct aggr_cpu_id id;
@@ -250,7 +249,7 @@ struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unu
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	id.node = cpu_map__get_node_id(cpu);
+	id.node = cpu__get_node(cpu);
 	return id;
 }
 
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 9589b0001a28..f849f01c5860 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -45,7 +45,6 @@ int cpu_map__get_die_id(int cpu);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
 int cpu_map__get_core_id(int cpu);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
-int cpu_map__get_node_id(int cpu);
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp);
 int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep);
@@ -58,6 +57,10 @@ int cpu__setup_cpunode_map(void);
 int cpu__max_node(void);
 int cpu__max_cpu(void);
 int cpu__max_present_cpu(void);
+/**
+ * cpu__get_node - Returns the numa node X as read from
+ * /sys/devices/system/node/nodeX for the given CPU.
+ */
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
-- 
cgit v1.2.3


From 4e90e5cc74c6b1c1b9abff8b53cec5be1fb5e839 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:19 -0800
Subject: perf cpumap: Remove map from function names that don't use a map

Move to the cpu name and document for consistency.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-17-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c | 12 ++++++------
 tools/perf/util/cpumap.h | 19 ++++++++++++++++---
 tools/perf/util/env.c    |  6 +++---
 tools/perf/util/stat.c   |  4 ++--
 4 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 1626b0991408..e0d7f1da5858 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -126,7 +126,7 @@ static int cpu__get_topology_int(int cpu, const char *name, int *value)
 	return sysfs__read_int(path, value);
 }
 
-int cpu_map__get_socket_id(int cpu)
+int cpu__get_socket_id(int cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu, "physical_package_id", &value);
 	return ret ?: value;
@@ -136,7 +136,7 @@ struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_u
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	id.socket = cpu_map__get_socket_id(cpu);
+	id.socket = cpu__get_socket_id(cpu);
 	return id;
 }
 
@@ -190,7 +190,7 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 	return 0;
 }
 
-int cpu_map__get_die_id(int cpu)
+int cpu__get_die_id(int cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu, "die_id", &value);
 
@@ -202,7 +202,7 @@ struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 	struct aggr_cpu_id id;
 	int die;
 
-	die = cpu_map__get_die_id(cpu);
+	die = cpu__get_die_id(cpu);
 	/* There is no die_id on legacy system. */
 	if (die == -1)
 		die = 0;
@@ -220,7 +220,7 @@ struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-int cpu_map__get_core_id(int cpu)
+int cpu__get_core_id(int cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu, "core_id", &value);
 	return ret ?: value;
@@ -229,7 +229,7 @@ int cpu_map__get_core_id(int cpu)
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 {
 	struct aggr_cpu_id id;
-	int core = cpu_map__get_core_id(cpu);
+	int core = cpu__get_core_id(cpu);
 
 	/* cpu_map__get_die returns a struct with socket and die set*/
 	id = cpu_map__get_die_aggr_by_cpu(cpu, data);
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index f849f01c5860..a053bf31a3f0 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -39,11 +39,8 @@ struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp);
-int cpu_map__get_socket_id(int cpu);
 struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
-int cpu_map__get_die_id(int cpu);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
-int cpu_map__get_core_id(int cpu);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp);
@@ -62,6 +59,22 @@ int cpu__max_present_cpu(void);
  * /sys/devices/system/node/nodeX for the given CPU.
  */
 int cpu__get_node(int cpu);
+/**
+ * cpu__get_socket_id - Returns the socket number as read from
+ * /sys/devices/system/cpu/cpuX/topology/physical_package_id for the given CPU.
+ */
+int cpu__get_socket_id(int cpu);
+/**
+ * cpu__get_die_id - Returns the die id as read from
+ * /sys/devices/system/cpu/cpuX/topology/die_id for the given CPU.
+ */
+int cpu__get_die_id(int cpu);
+/**
+ * cpu__get_core_id - Returns the core id as read from
+ * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU.
+ */
+int cpu__get_core_id(int cpu);
+
 
 int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 		       struct aggr_cpu_id (*f)(int cpu, void *data),
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index b9904896eb97..fd12c0dcaefb 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -302,9 +302,9 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
 		return -ENOMEM;
 
 	for (cpu = 0; cpu < nr_cpus; ++cpu) {
-		env->cpu[cpu].core_id	= cpu_map__get_core_id(cpu);
-		env->cpu[cpu].socket_id	= cpu_map__get_socket_id(cpu);
-		env->cpu[cpu].die_id	= cpu_map__get_die_id(cpu);
+		env->cpu[cpu].core_id	= cpu__get_core_id(cpu);
+		env->cpu[cpu].socket_id	= cpu__get_socket_id(cpu);
+		env->cpu[cpu].die_id	= cpu__get_die_id(cpu);
 	}
 
 	env->nr_cpus_avail = nr_cpus;
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 5ed99bcfe91e..5c24aca0968c 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -328,7 +328,7 @@ static int check_per_pkg(struct evsel *counter,
 	if (!(vals->run && vals->ena))
 		return 0;
 
-	s = cpu_map__get_socket_id(cpu);
+	s = cpu__get_socket_id(cpu);
 	if (s < 0)
 		return -1;
 
@@ -336,7 +336,7 @@ static int check_per_pkg(struct evsel *counter,
 	 * On multi-die system, die_id > 0. On no-die system, die_id = 0.
 	 * We use hashmap(socket, die) to check the used socket+die pair.
 	 */
-	d = cpu_map__get_die_id(cpu);
+	d = cpu__get_die_id(cpu);
 	if (d < 0)
 		return -1;
 
-- 
cgit v1.2.3


From adff2c634357115a0f94a9a5054061b497df7f72 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:20 -0800
Subject: perf cpumap: Remove cpu_map__cpu(), use libperf function

Switch the remaining few users of cpu_map__cpu() to perf_cpu_map__cpu()
and remove the function.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-18-irogers@google.com
[ Did the conversion to perf_ftrace__latency_prepare_bpf() as well, used when building with BUILD_BPF_SKEL=1 ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-ftrace.c  | 2 +-
 tools/perf/util/bpf_ftrace.c | 2 +-
 tools/perf/util/cpumap.c     | 9 ++-------
 tools/perf/util/cpumap.h     | 1 -
 4 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 2b54e2ddc80a..f16c39a37a52 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -281,7 +281,7 @@ static int set_tracing_cpumask(struct perf_cpu_map *cpumap)
 	int ret;
 	int last_cpu;
 
-	last_cpu = cpu_map__cpu(cpumap, cpumap->nr - 1);
+	last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1);
 	mask_size = last_cpu / 4 + 2; /* one more byte for EOS */
 	mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */
 
diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c
index f00a2de6778c..28dc4c60c788 100644
--- a/tools/perf/util/bpf_ftrace.c
+++ b/tools/perf/util/bpf_ftrace.c
@@ -63,7 +63,7 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
 		fd = bpf_map__fd(skel->maps.cpu_filter);
 
 		for (i = 0; i < ncpus; i++) {
-			cpu = cpu_map__cpu(ftrace->evlist->core.cpus, i);
+			cpu = perf_cpu_map__cpu(ftrace->evlist->core.cpus, i);
 			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
 		}
 	}
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index e0d7f1da5858..32f9fc2dd389 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -485,11 +485,6 @@ bool cpu_map__has(struct perf_cpu_map *cpus, int cpu)
 	return perf_cpu_map__idx(cpus, cpu) != -1;
 }
 
-int cpu_map__cpu(struct perf_cpu_map *cpus, int idx)
-{
-	return cpus->map[idx];
-}
-
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size)
 {
 	int i, cpu, start = -1;
@@ -547,7 +542,7 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size)
 	int i, cpu;
 	char *ptr = buf;
 	unsigned char *bitmap;
-	int last_cpu = cpu_map__cpu(map, map->nr - 1);
+	int last_cpu = perf_cpu_map__cpu(map, map->nr - 1);
 
 	if (buf == NULL)
 		return 0;
@@ -559,7 +554,7 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size)
 	}
 
 	for (i = 0; i < map->nr; i++) {
-		cpu = cpu_map__cpu(map, i);
+		cpu = perf_cpu_map__cpu(map, i);
 		bitmap[cpu / 8] |= 1 << (cpu % 8);
 	}
 
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index a053bf31a3f0..87545bcd461d 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -80,7 +80,6 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
 		       struct aggr_cpu_id (*f)(int cpu, void *data),
 		       void *data);
 
-int cpu_map__cpu(struct perf_cpu_map *cpus, int idx);
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
 bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
-- 
cgit v1.2.3


From 5f50e15c1510c77b37e10c6b22912bf4bf11476b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:21 -0800
Subject: perf cpumap: Refactor cpu_map__build_map()

Turn it into a cpu_aggr_map__new(). Pass helper functions. Refactor
builtin-stat calls to manually pass function pointers. Try to reduce
some copy-paste code.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-19-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 187 +++++++++++++++++++++++++---------------------
 tools/perf/util/cpumap.c  |  59 ++++++---------
 tools/perf/util/cpumap.h  |  16 ++--
 3 files changed, 130 insertions(+), 132 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index d06921cd3592..c55a7fee22bc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1322,6 +1322,17 @@ static struct option stat_options[] = {
 	OPT_END()
 };
 
+static const char *const aggr_mode__string[] = {
+	[AGGR_CORE] = "core",
+	[AGGR_DIE] = "die",
+	[AGGR_GLOBAL] = "global",
+	[AGGR_NODE] = "node",
+	[AGGR_NONE] = "none",
+	[AGGR_SOCKET] = "socket",
+	[AGGR_THREAD] = "thread",
+	[AGGR_UNSET] = "unset",
+};
+
 static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config *config __maybe_unused,
 						int cpu)
 {
@@ -1394,54 +1405,67 @@ static bool term_percore_set(void)
 	return false;
 }
 
-static int perf_stat_init_aggr_mode(void)
+static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
 {
-	int nr;
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		return cpu_map__get_socket_aggr_by_cpu;
+	case AGGR_DIE:
+		return cpu_map__get_die_aggr_by_cpu;
+	case AGGR_CORE:
+		return cpu_map__get_core_aggr_by_cpu;
+	case AGGR_NODE:
+		return cpu_map__get_node_aggr_by_cpu;
+	case AGGR_NONE:
+		if (term_percore_set())
+			return cpu_map__get_core_aggr_by_cpu;
+
+		return NULL;
+	case AGGR_GLOBAL:
+	case AGGR_THREAD:
+	case AGGR_UNSET:
+	default:
+		return NULL;
+	}
+}
 
-	switch (stat_config.aggr_mode) {
+static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode)
+{
+	switch (aggr_mode) {
 	case AGGR_SOCKET:
-		if (cpu_map__build_socket_map(evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build socket map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_socket_cached;
-		break;
+		return perf_stat__get_socket_cached;
 	case AGGR_DIE:
-		if (cpu_map__build_die_map(evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build die map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_die_cached;
-		break;
+		return perf_stat__get_die_cached;
 	case AGGR_CORE:
-		if (cpu_map__build_core_map(evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build core map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_core_cached;
-		break;
+		return perf_stat__get_core_cached;
 	case AGGR_NODE:
-		if (cpu_map__build_node_map(evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build core map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_node_cached;
-		break;
+		return perf_stat__get_node_cached;
 	case AGGR_NONE:
 		if (term_percore_set()) {
-			if (cpu_map__build_core_map(evsel_list->core.cpus,
-						    &stat_config.aggr_map)) {
-				perror("cannot build core map");
-				return -1;
-			}
-			stat_config.aggr_get_id = perf_stat__get_core_cached;
+			return perf_stat__get_core_cached;
 		}
-		break;
+		return NULL;
 	case AGGR_GLOBAL:
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	default:
-		break;
+		return NULL;
+	}
+}
+
+static int perf_stat_init_aggr_mode(void)
+{
+	int nr;
+	aggr_cpu_id_get_t get_id = aggr_mode__get_aggr(stat_config.aggr_mode);
+
+	if (get_id) {
+		stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.cpus,
+							 get_id, /*data=*/NULL);
+		if (!stat_config.aggr_map) {
+			pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]);
+			return -1;
+		}
+		stat_config.aggr_get_id = aggr_mode__get_id(stat_config.aggr_mode);
 	}
 
 	/*
@@ -1533,30 +1557,6 @@ static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 	return id;
 }
 
-static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus,
-				      struct cpu_aggr_map **sockp)
-{
-	return cpu_map__build_map(cpus, sockp, perf_env__get_socket_aggr_by_cpu, env);
-}
-
-static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map *cpus,
-				   struct cpu_aggr_map **diep)
-{
-	return cpu_map__build_map(cpus, diep, perf_env__get_die_aggr_by_cpu, env);
-}
-
-static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map *cpus,
-				    struct cpu_aggr_map **corep)
-{
-	return cpu_map__build_map(cpus, corep, perf_env__get_core_aggr_by_cpu, env);
-}
-
-static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map *cpus,
-				    struct cpu_aggr_map **nodep)
-{
-	return cpu_map__build_map(cpus, nodep, perf_env__get_node_aggr_by_cpu, env);
-}
-
 static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
 						     int cpu)
 {
@@ -1580,47 +1580,60 @@ static struct aggr_cpu_id perf_stat__get_node_file(struct perf_stat_config *conf
 	return perf_env__get_node_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
-static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
+static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode)
 {
-	struct perf_env *env = &st->session->header.env;
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		return perf_env__get_socket_aggr_by_cpu;
+	case AGGR_DIE:
+		return perf_env__get_die_aggr_by_cpu;
+	case AGGR_CORE:
+		return perf_env__get_core_aggr_by_cpu;
+	case AGGR_NODE:
+		return perf_env__get_node_aggr_by_cpu;
+	case AGGR_NONE:
+	case AGGR_GLOBAL:
+	case AGGR_THREAD:
+	case AGGR_UNSET:
+	default:
+		return NULL;
+	}
+}
 
-	switch (stat_config.aggr_mode) {
+static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode)
+{
+	switch (aggr_mode) {
 	case AGGR_SOCKET:
-		if (perf_env__build_socket_map(env, evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build socket map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_socket_file;
-		break;
+		return perf_stat__get_socket_file;
 	case AGGR_DIE:
-		if (perf_env__build_die_map(env, evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build die map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_die_file;
-		break;
+		return perf_stat__get_die_file;
 	case AGGR_CORE:
-		if (perf_env__build_core_map(env, evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build core map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_core_file;
-		break;
+		return perf_stat__get_core_file;
 	case AGGR_NODE:
-		if (perf_env__build_node_map(env, evsel_list->core.cpus, &stat_config.aggr_map)) {
-			perror("cannot build core map");
-			return -1;
-		}
-		stat_config.aggr_get_id = perf_stat__get_node_file;
-		break;
+		return perf_stat__get_node_file;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	default:
-		break;
+		return NULL;
 	}
+}
+
+static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
+{
+	struct perf_env *env = &st->session->header.env;
+	aggr_cpu_id_get_t get_id = aggr_mode__get_aggr_file(stat_config.aggr_mode);
 
+	if (!get_id)
+		return 0;
+
+	stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.cpus, get_id, env);
+	if (!stat_config.aggr_map) {
+		pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]);
+		return -1;
+	}
+	stat_config.aggr_get_id = aggr_mode__get_id_file(stat_config.aggr_mode);
 	return 0;
 }
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 32f9fc2dd389..c8f9b3f15759 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -140,7 +140,7 @@ struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_u
 	return id;
 }
 
-static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer)
+static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer)
 {
 	struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer;
 	struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer;
@@ -157,37 +157,40 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer)
 		return a->thread - b->thread;
 }
 
-int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
-		       struct aggr_cpu_id (*f)(int cpu, void *data),
-		       void *data)
+struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
+				       aggr_cpu_id_get_t get_id,
+				       void *data)
 {
-	int nr = cpus->nr;
-	struct cpu_aggr_map *c = cpu_aggr_map__empty_new(nr);
-	int cpu, s2;
-	struct aggr_cpu_id s1;
+	int cpu, idx;
+	struct cpu_aggr_map *c = cpu_aggr_map__empty_new(cpus->nr);
 
 	if (!c)
-		return -1;
+		return NULL;
 
 	/* Reset size as it may only be partially filled */
 	c->nr = 0;
 
-	for (cpu = 0; cpu < nr; cpu++) {
-		s1 = f(cpu, data);
-		for (s2 = 0; s2 < c->nr; s2++) {
-			if (aggr_cpu_id__equal(&s1, &c->map[s2]))
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+		bool duplicate = false;
+		struct aggr_cpu_id cpu_id = get_id(cpu, data);
+
+		for (int j = 0; j < c->nr; j++) {
+			if (aggr_cpu_id__equal(&cpu_id, &c->map[j])) {
+				duplicate = true;
 				break;
+			}
 		}
-		if (s2 == c->nr) {
-			c->map[c->nr] = s1;
+		if (!duplicate) {
+			c->map[c->nr] = cpu_id;
 			c->nr++;
 		}
 	}
+
 	/* ensure we process id in increasing order */
-	qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), cmp_aggr_cpu_id);
+	qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), aggr_cpu_id__cmp);
+
+	return c;
 
-	*res = c;
-	return 0;
 }
 
 int cpu__get_die_id(int cpu)
@@ -253,26 +256,6 @@ struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unu
 	return id;
 }
 
-int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp)
-{
-	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket_aggr_by_cpu, NULL);
-}
-
-int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep)
-{
-	return cpu_map__build_map(cpus, diep, cpu_map__get_die_aggr_by_cpu, NULL);
-}
-
-int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep)
-{
-	return cpu_map__build_map(cpus, corep, cpu_map__get_core_aggr_by_cpu, NULL);
-}
-
-int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **numap)
-{
-	return cpu_map__build_map(cpus, numap, cpu_map__get_node_aggr_by_cpu, NULL);
-}
-
 /* setup simple routines to easily access node numbers given a cpu number */
 static int get_max_num(char *path, int *max)
 {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 87545bcd461d..611048e2a592 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -43,10 +43,6 @@ struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
 struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
-int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp);
-int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep);
-int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep);
-int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **nodep);
 const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
 
 int cpu__setup_cpunode_map(void);
@@ -75,10 +71,16 @@ int cpu__get_die_id(int cpu);
  */
 int cpu__get_core_id(int cpu);
 
+typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(int cpu, void *data);
 
-int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res,
-		       struct aggr_cpu_id (*f)(int cpu, void *data),
-		       void *data);
+/**
+ * cpu_aggr_map__new - Create a cpu_aggr_map with an aggr_cpu_id for each cpu in
+ * cpus. The aggr_cpu_id is created with 'get_id' that may have a data value
+ * passed to it. The cpu_aggr_map is sorted with duplicate values removed.
+ */
+struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
+				       aggr_cpu_id_get_t get_id,
+				       void *data);
 
 bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
 
-- 
cgit v1.2.3


From 973aeb3c7ada35b75442126c745bb6074cb3e172 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:22 -0800
Subject: perf cpumap: Rename cpu_map__get_X_aggr_by_cpu functions

The functions don't use a cpu_map so reduce them to being like
constructors of aggr_cpu_id.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-20-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c   | 18 +++++++++---------
 tools/perf/tests/topology.c |  8 ++++----
 tools/perf/util/cpumap.c    | 14 +++++++-------
 tools/perf/util/cpumap.h    | 29 +++++++++++++++++++++++++----
 4 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c55a7fee22bc..a518fcf0b3f8 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1336,25 +1336,25 @@ static const char *const aggr_mode__string[] = {
 static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config *config __maybe_unused,
 						int cpu)
 {
-	return cpu_map__get_socket_aggr_by_cpu(cpu, /*data=*/NULL);
+	return aggr_cpu_id__socket(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
 					     int cpu)
 {
-	return cpu_map__get_die_aggr_by_cpu(cpu, /*data=*/NULL);
+	return aggr_cpu_id__die(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 					      int cpu)
 {
-	return cpu_map__get_core_aggr_by_cpu(cpu, /*data=*/NULL);
+	return aggr_cpu_id__core(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
 					      int cpu)
 {
-	return cpu_map__get_node_aggr_by_cpu(cpu, /*data=*/NULL);
+	return aggr_cpu_id__node(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
@@ -1409,16 +1409,16 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
 {
 	switch (aggr_mode) {
 	case AGGR_SOCKET:
-		return cpu_map__get_socket_aggr_by_cpu;
+		return aggr_cpu_id__socket;
 	case AGGR_DIE:
-		return cpu_map__get_die_aggr_by_cpu;
+		return aggr_cpu_id__die;
 	case AGGR_CORE:
-		return cpu_map__get_core_aggr_by_cpu;
+		return aggr_cpu_id__core;
 	case AGGR_NODE:
-		return cpu_map__get_node_aggr_by_cpu;
+		return aggr_cpu_id__node;
 	case AGGR_NONE:
 		if (term_percore_set())
-			return cpu_map__get_core_aggr_by_cpu;
+			return aggr_cpu_id__core;
 
 		return NULL;
 	case AGGR_GLOBAL:
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 5992b323c4f5..0cb7b015b4b9 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -121,7 +121,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that core ID contains socket, die and core
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_core_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
+		id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Core map - Core ID doesn't match",
 			session->header.env.cpu[map->map[i]].core_id == id.core);
 
@@ -136,7 +136,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that die ID contains socket and die
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_die_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
+		id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
 			session->header.env.cpu[map->map[i]].socket_id == id.socket);
 
@@ -150,7 +150,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that socket ID contains only socket
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_socket_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
+		id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
 			session->header.env.cpu[map->map[i]].socket_id == id.socket);
 
@@ -162,7 +162,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	// Test that node ID contains only node
 	for (i = 0; i < map->nr; i++) {
-		id = cpu_map__get_node_aggr_by_cpu(perf_cpu_map__cpu(map, i), NULL);
+		id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Node map - Node ID doesn't match",
 			cpu__get_node(map->map[i]) == id.node);
 		TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index c8f9b3f15759..19e502cc65e7 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -132,7 +132,7 @@ int cpu__get_socket_id(int cpu)
 	return ret ?: value;
 }
 
-struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data __maybe_unused)
+struct aggr_cpu_id aggr_cpu_id__socket(int cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
@@ -200,7 +200,7 @@ int cpu__get_die_id(int cpu)
 	return ret ?: value;
 }
 
-struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
+struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data)
 {
 	struct aggr_cpu_id id;
 	int die;
@@ -215,7 +215,7 @@ struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data)
 	 * with the socket ID and then add die to
 	 * make a unique ID.
 	 */
-	id = cpu_map__get_socket_aggr_by_cpu(cpu, data);
+	id = aggr_cpu_id__socket(cpu, data);
 	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
@@ -229,13 +229,13 @@ int cpu__get_core_id(int cpu)
 	return ret ?: value;
 }
 
-struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
+struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data)
 {
 	struct aggr_cpu_id id;
 	int core = cpu__get_core_id(cpu);
 
-	/* cpu_map__get_die returns a struct with socket and die set*/
-	id = cpu_map__get_die_aggr_by_cpu(cpu, data);
+	/* aggr_cpu_id__die returns a struct with socket and die set*/
+	id = aggr_cpu_id__die(cpu, data);
 	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
@@ -248,7 +248,7 @@ struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data)
 
 }
 
-struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data __maybe_unused)
+struct aggr_cpu_id aggr_cpu_id__node(int cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 611048e2a592..ecd658293a2d 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -39,10 +39,6 @@ struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
 size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp);
-struct aggr_cpu_id cpu_map__get_socket_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_die_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_core_aggr_by_cpu(int cpu, void *data);
-struct aggr_cpu_id cpu_map__get_node_aggr_by_cpu(int cpu, void *data);
 const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
 
 int cpu__setup_cpunode_map(void);
@@ -88,4 +84,29 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a);
 struct aggr_cpu_id aggr_cpu_id__empty(void);
 
+
+/**
+ * aggr_cpu_id__socket - Create an aggr_cpu_id with the socket populated with
+ * the socket for cpu. The function signature is compatible with
+ * aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__socket(int cpu, void *data);
+/**
+ * aggr_cpu_id__die - Create an aggr_cpu_id with the die and socket populated
+ * with the die and socket for cpu. The function signature is compatible with
+ * aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data);
+/**
+ * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket
+ * populated with the core, die and socket for cpu. The function signature is
+ * compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data);
+/**
+ * aggr_cpu_id__node - Create an aggr_cpu_id with the numa node populated for
+ * cpu. The function signature is compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__node(int cpu, void *data);
+
 #endif /* __PERF_CPUMAP_H */
-- 
cgit v1.2.3


From dfc66beff7fa95b9eb507ccb48fb325569bc2f74 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:23 -0800
Subject: perf cpumap: Move 'has' function to libperf

Make the cpu map argument const for consistency with the rest of the
API. Modify cpu_map__idx accordingly.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-21-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Documentation/libperf.txt |  1 +
 tools/lib/perf/cpumap.c                  |  7 ++++++-
 tools/lib/perf/include/internal/cpumap.h |  2 +-
 tools/lib/perf/include/perf/cpumap.h     |  1 +
 tools/lib/perf/libperf.map               |  1 +
 tools/perf/arch/arm/util/cs-etm.c        | 16 ++++++++--------
 tools/perf/builtin-sched.c               |  6 +++---
 tools/perf/tests/topology.c              |  2 +-
 tools/perf/util/cpumap.c                 |  5 -----
 tools/perf/util/cpumap.h                 |  2 --
 tools/perf/util/cputopo.c                |  2 +-
 11 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index 63ae5e0195ce..faef9ba3a540 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -48,6 +48,7 @@ SYNOPSIS
   int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
   bool perf_cpu_map__empty(const struct perf_cpu_map *map);
   int perf_cpu_map__max(struct perf_cpu_map *map);
+  bool perf_cpu_map__has(const struct perf_cpu_map *map, int cpu);
 
   #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)
 --
diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index adaad3dddf6e..3c36a06771af 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -268,7 +268,7 @@ bool perf_cpu_map__empty(const struct perf_cpu_map *map)
 	return map ? map->map[0] == -1 : true;
 }
 
-int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu)
+int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu)
 {
 	int low = 0, high = cpus->nr;
 
@@ -288,6 +288,11 @@ int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu)
 	return -1;
 }
 
+bool perf_cpu_map__has(const struct perf_cpu_map *cpus, int cpu)
+{
+	return perf_cpu_map__idx(cpus, cpu) != -1;
+}
+
 int perf_cpu_map__max(struct perf_cpu_map *map)
 {
 	// cpu_map__trim_new() qsort()s it, cpu_map__default_new() sorts it as well.
diff --git a/tools/lib/perf/include/internal/cpumap.h b/tools/lib/perf/include/internal/cpumap.h
index 4054169c12c5..71a31ed738c9 100644
--- a/tools/lib/perf/include/internal/cpumap.h
+++ b/tools/lib/perf/include/internal/cpumap.h
@@ -23,6 +23,6 @@ struct perf_cpu_map {
 #define MAX_NR_CPUS	2048
 #endif
 
-int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu);
+int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu);
 
 #endif /* __LIBPERF_INTERNAL_CPUMAP_H */
diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h
index 7c27766ea0bf..3f1c0afa3ccd 100644
--- a/tools/lib/perf/include/perf/cpumap.h
+++ b/tools/lib/perf/include/perf/cpumap.h
@@ -20,6 +20,7 @@ LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
 LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
 LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
+LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, int cpu);
 
 #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
 	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map
index 5979bf92d98f..93696affda2e 100644
--- a/tools/lib/perf/libperf.map
+++ b/tools/lib/perf/libperf.map
@@ -10,6 +10,7 @@ LIBPERF_0.0.1 {
 		perf_cpu_map__cpu;
 		perf_cpu_map__empty;
 		perf_cpu_map__max;
+		perf_cpu_map__has;
 		perf_thread_map__new_dummy;
 		perf_thread_map__set_pid;
 		perf_thread_map__comm;
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 8a3d54a86c9c..129c0272d65b 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -204,8 +204,8 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
 
 	/* Set option of each CPU we have */
 	for (i = 0; i < cpu__max_cpu(); i++) {
-		if (!cpu_map__has(event_cpus, i) ||
-		    !cpu_map__has(online_cpus, i))
+		if (!perf_cpu_map__has(event_cpus, i) ||
+		    !perf_cpu_map__has(online_cpus, i))
 			continue;
 
 		if (option & BIT(ETM_OPT_CTXTID)) {
@@ -523,8 +523,8 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused,
 	/* cpu map is not empty, we have specific CPUs to work with */
 	if (!perf_cpu_map__empty(event_cpus)) {
 		for (i = 0; i < cpu__max_cpu(); i++) {
-			if (!cpu_map__has(event_cpus, i) ||
-			    !cpu_map__has(online_cpus, i))
+			if (!perf_cpu_map__has(event_cpus, i) ||
+			    !perf_cpu_map__has(online_cpus, i))
 				continue;
 
 			if (cs_etm_is_ete(itr, i))
@@ -537,7 +537,7 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused,
 	} else {
 		/* get configuration for all CPUs in the system */
 		for (i = 0; i < cpu__max_cpu(); i++) {
-			if (!cpu_map__has(online_cpus, i))
+			if (!perf_cpu_map__has(online_cpus, i))
 				continue;
 
 			if (cs_etm_is_ete(itr, i))
@@ -722,8 +722,8 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 	} else {
 		/* Make sure all specified CPUs are online */
 		for (i = 0; i < perf_cpu_map__nr(event_cpus); i++) {
-			if (cpu_map__has(event_cpus, i) &&
-			    !cpu_map__has(online_cpus, i))
+			if (perf_cpu_map__has(event_cpus, i) &&
+			    !perf_cpu_map__has(online_cpus, i))
 				return -EINVAL;
 		}
 
@@ -744,7 +744,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 	offset = CS_ETM_SNAPSHOT + 1;
 
 	for (i = 0; i < cpu__max_cpu() && offset < priv_size; i++)
-		if (cpu_map__has(cpu_map, i))
+		if (perf_cpu_map__has(cpu_map, i))
 			cs_etm_get_metadata(i, &offset, itr, info);
 
 	perf_cpu_map__put(online_cpus);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4527f632ebe4..9da1da4749c9 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1617,10 +1617,10 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 		if (curr_thread && thread__has_color(curr_thread))
 			pid_color = COLOR_PIDS;
 
-		if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
+		if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, cpu))
 			continue;
 
-		if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
+		if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
 			cpu_color = COLOR_CPUS;
 
 		if (cpu != this_cpu)
@@ -1639,7 +1639,7 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 			color_fprintf(stdout, color, "   ");
 	}
 
-	if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
+	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu))
 		goto out;
 
 	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 0cb7b015b4b9..cb29ea7ec409 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -112,7 +112,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu);
 
 	for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
-		if (!cpu_map__has(map, i))
+		if (!perf_cpu_map__has(map, i))
 			continue;
 		pr_debug("CPU %d, core %d, socket %d\n", i,
 			 session->header.env.cpu[i].core_id,
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 19e502cc65e7..f1d76a8e92e8 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -463,11 +463,6 @@ int cpu__setup_cpunode_map(void)
 	return 0;
 }
 
-bool cpu_map__has(struct perf_cpu_map *cpus, int cpu)
-{
-	return perf_cpu_map__idx(cpus, cpu) != -1;
-}
-
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size)
 {
 	int i, cpu, start = -1;
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index ecd658293a2d..32b8b5178f01 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -78,8 +78,6 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 				       aggr_cpu_id_get_t get_id,
 				       void *data);
 
-bool cpu_map__has(struct perf_cpu_map *cpus, int cpu);
-
 bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
 bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a);
 struct aggr_cpu_id aggr_cpu_id__empty(void);
diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c
index 51b429c86f98..8affb37d90e7 100644
--- a/tools/perf/util/cputopo.c
+++ b/tools/perf/util/cputopo.c
@@ -218,7 +218,7 @@ struct cpu_topology *cpu_topology__new(void)
 	tp->core_cpus_list = addr;
 
 	for (i = 0; i < nr; i++) {
-		if (!cpu_map__has(map, i))
+		if (!perf_cpu_map__has(map, i))
 			continue;
 
 		ret = build_cpu_topology(tp, i);
-- 
cgit v1.2.3


From 92aad5c33f531187cc6013c8e51620212cdfefe1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:24 -0800
Subject: perf cpumap: Add some comments to cpu_aggr_map

Move cpu_aggr_map__empty_new() to be with other cpu_aggr_map function.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-22-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 32b8b5178f01..25a08d640d81 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -24,16 +24,18 @@ struct aggr_cpu_id {
 	int core;
 };
 
+/** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */
 struct cpu_aggr_map {
 	refcount_t refcnt;
+	/** Number of valid entries. */
 	int nr;
+	/** The entries. */
 	struct aggr_cpu_id map[];
 };
 
 struct perf_record_cpu_map_data;
 
 struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
-struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
 
 struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data);
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
@@ -67,6 +69,12 @@ int cpu__get_die_id(int cpu);
  */
 int cpu__get_core_id(int cpu);
 
+/**
+ * cpu_aggr_map__empty_new - Create a cpu_aggr_map of size nr with every entry
+ * being empty.
+ */
+struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
+
 typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(int cpu, void *data);
 
 /**
-- 
cgit v1.2.3


From bd26bddfd93688d10984251249b84e1f6d91de27 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:25 -0800
Subject: perf cpumap: Trim the cpu_aggr_map

cpu_aggr_map__new() removes duplicates, when this happens shrink the
array.

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-23-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index f1d76a8e92e8..2779474f39db 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -185,7 +185,15 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 			c->nr++;
 		}
 	}
-
+	/* Trim. */
+	if (c->nr != cpus->nr) {
+		struct cpu_aggr_map *trimmed_c =
+			realloc(c,
+				sizeof(struct cpu_aggr_map) + sizeof(struct aggr_cpu_id) * c->nr);
+
+		if (trimmed_c)
+			c = trimmed_c;
+	}
 	/* ensure we process id in increasing order */
 	qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), aggr_cpu_id__cmp);
 
-- 
cgit v1.2.3


From f9e891ea172235f902972069b87be3bdc7c48f5a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:26 -0800
Subject: perf stat: Fix memory leak in check_per_pkg()

If the key is already present then free the key used for lookup.

Found with:
$ perf stat -M IO_Read_BW /bin/true

==1749112==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 32 byte(s) in 4 object(s) allocated from:
    #0 0x7f6f6fa7d7cf in __interceptor_malloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:145
    #1 0x55acecd9d7a6 in check_per_pkg util/stat.c:343
    #2 0x55acecd9d9c5 in process_counter_values util/stat.c:365
    #3 0x55acecd9e0ab in process_counter_maps util/stat.c:421
    #4 0x55acecd9e292 in perf_stat_process_counter util/stat.c:443
    #5 0x55aceca8553e in read_counters ./tools/perf/builtin-stat.c:470
    #6 0x55aceca88fe3 in __run_perf_stat ./tools/perf/builtin-stat.c:1023
    #7 0x55aceca89146 in run_perf_stat ./tools/perf/builtin-stat.c:1048
    #8 0x55aceca90858 in cmd_stat ./tools/perf/builtin-stat.c:2555
    #9 0x55acecc05fa5 in run_builtin ./tools/perf/perf.c:313
    #10 0x55acecc064fe in handle_internal_command ./tools/perf/perf.c:365
    #11 0x55acecc068bb in run_argv ./tools/perf/perf.c:409
    #12 0x55acecc070aa in main ./tools/perf/perf.c:539

Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-24-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 5c24aca0968c..c69b221f5e3e 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -345,9 +345,10 @@ static int check_per_pkg(struct evsel *counter,
 		return -ENOMEM;
 
 	*key = (uint64_t)d << 32 | s;
-	if (hashmap__find(mask, (void *)key, NULL))
+	if (hashmap__find(mask, (void *)key, NULL)) {
 		*skip = true;
-	else
+		free(key);
+	} else
 		ret = hashmap__add(mask, (void *)key, (void *)1);
 
 	return ret;
-- 
cgit v1.2.3


From 34794913e2dc08a464499f795073a021feeb3b47 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:27 -0800
Subject: perf cpumap: Add CPU to aggr_cpu_id

With no aggregration, such as 'perf stat -A', the aggr_cpu_id lacks a
way to describe per CPU aggregation and the core is set to the CPU in
places like print_counter_aggrdata in stat-display.c.

Setting the core to the CPU is undesirable as the CPU will exceed valid
core values and lead to confusion. Add a CPU variable to address this.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-25-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/topology.c | 19 +++++++++++++++++++
 tools/perf/util/cpumap.c    | 25 +++++++++++++++++++++----
 tools/perf/util/cpumap.h    |  8 ++++++++
 3 files changed, 48 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index cb29ea7ec409..33e4cb81265c 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -119,6 +119,22 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 			 session->header.env.cpu[i].socket_id);
 	}
 
+	// Test that CPU ID contains socket, die, core and CPU
+	for (i = 0; i < map->nr; i++) {
+		id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL);
+		TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i] == id.cpu);
+
+		TEST_ASSERT_VAL("Cpu map - Core ID doesn't match",
+			session->header.env.cpu[map->map[i]].core_id == id.core);
+		TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match",
+			session->header.env.cpu[map->map[i]].socket_id == id.socket);
+
+		TEST_ASSERT_VAL("Cpu map - Die ID doesn't match",
+			session->header.env.cpu[map->map[i]].die_id == id.die);
+		TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1);
+		TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1);
+	}
+
 	// Test that core ID contains socket, die and core
 	for (i = 0; i < map->nr; i++) {
 		id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL);
@@ -145,6 +161,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 		TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
+		TEST_ASSERT_VAL("Die map - CPU is set", id.cpu == -1);
 		TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
 	}
 
@@ -157,6 +174,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 		TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
 		TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
+		TEST_ASSERT_VAL("Socket map - CPU is set", id.cpu == -1);
 		TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
 	}
 
@@ -168,6 +186,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 		TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
 		TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
 		TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
+		TEST_ASSERT_VAL("Node map - CPU is set", id.cpu == -1);
 		TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1);
 	}
 	perf_session__delete(session);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2779474f39db..48ce583af0ec 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -242,7 +242,7 @@ struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data)
 	struct aggr_cpu_id id;
 	int core = cpu__get_core_id(cpu);
 
-	/* aggr_cpu_id__die returns a struct with socket and die set*/
+	/* aggr_cpu_id__die returns a struct with socket and die set. */
 	id = aggr_cpu_id__die(cpu, data);
 	if (aggr_cpu_id__is_empty(&id))
 		return id;
@@ -256,6 +256,20 @@ struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data)
 
 }
 
+struct aggr_cpu_id aggr_cpu_id__cpu(int cpu, void *data)
+{
+	struct aggr_cpu_id id;
+
+	/* aggr_cpu_id__core returns a struct with socket, die and core set. */
+	id = aggr_cpu_id__core(cpu, data);
+	if (aggr_cpu_id__is_empty(&id))
+		return id;
+
+	id.cpu = cpu;
+	return id;
+
+}
+
 struct aggr_cpu_id aggr_cpu_id__node(int cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
@@ -579,7 +593,8 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 		a->node == b->node &&
 		a->socket == b->socket &&
 		a->die == b->die &&
-		a->core == b->core;
+		a->core == b->core &&
+		a->cpu == b->cpu;
 }
 
 bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
@@ -588,7 +603,8 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
 		a->node == -1 &&
 		a->socket == -1 &&
 		a->die == -1 &&
-		a->core == -1;
+		a->core == -1 &&
+		a->cpu == -1;
 }
 
 struct aggr_cpu_id aggr_cpu_id__empty(void)
@@ -598,7 +614,8 @@ struct aggr_cpu_id aggr_cpu_id__empty(void)
 		.node = -1,
 		.socket = -1,
 		.die = -1,
-		.core = -1
+		.core = -1,
+		.cpu = -1
 	};
 	return ret;
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 25a08d640d81..b98cd1739677 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -22,6 +22,8 @@ struct aggr_cpu_id {
 	int die;
 	/** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */
 	int core;
+	/** CPU aggregation, note there is one CPU for each SMT thread. */
+	int cpu;
 };
 
 /** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */
@@ -109,6 +111,12 @@ struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data);
  * compatible with aggr_cpu_id_get_t.
  */
 struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data);
+/**
+ * aggr_cpu_id__core - Create an aggr_cpu_id with the cpu, core, die and socket
+ * populated with the cpu, core, die and socket for cpu. The function signature
+ * is compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__cpu(int cpu, void *data);
 /**
  * aggr_cpu_id__node - Create an aggr_cpu_id with the numa node populated for
  * cpu. The function signature is compatible with aggr_cpu_id_get_t.
-- 
cgit v1.2.3


From 7365f105e37429d28757f7f68d4850723ce18aa1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:28 -0800
Subject: perf stat-display: Avoid use of core for CPU

Correct use of cpumap index in print_no_aggr_metric().

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-26-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 45 +++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 870b1db71fbc..f48d1678861c 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -120,11 +120,10 @@ static void aggr_printout(struct perf_stat_config *config,
 				id.die,
 				config->csv_output ? 0 : -3,
 				id.core, config->csv_sep);
-		} else if (id.core > -1) {
+		} else if (id.cpu > -1) {
 			fprintf(config->output, "CPU%*d%s",
 				config->csv_output ? 0 : -7,
-				evsel__cpus(evsel)->map[id.core],
-				config->csv_sep);
+				id.cpu, config->csv_sep);
 		}
 		break;
 	case AGGR_THREAD:
@@ -334,7 +333,7 @@ static int first_shadow_cpu(struct perf_stat_config *config,
 	int cpu, idx;
 
 	if (config->aggr_mode == AGGR_NONE)
-		return id->core;
+		return id->cpu;
 
 	if (!config->aggr_get_id)
 		return 0;
@@ -697,10 +696,9 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 		fprintf(output, "%s", prefix);
 
 	uval = val * counter->scale;
-	if (cpu != -1) {
-		id = aggr_cpu_id__empty();
-		id.core = cpu;
-	}
+	if (cpu != -1)
+		id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
+
 	printout(config, id, nr, counter, uval,
 		 prefix, run, ena, 1.0, &rt_stat);
 	if (!metric_only)
@@ -911,8 +909,7 @@ static void print_counter(struct perf_stat_config *config,
 			fprintf(output, "%s", prefix);
 
 		uval = val * counter->scale;
-		id = aggr_cpu_id__empty();
-		id.core = cpu;
+		id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
 		printout(config, id, 0, counter, uval, prefix,
 			 run, ena, 1.0, &rt_stat);
 
@@ -924,29 +921,31 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 				 struct evlist *evlist,
 				 char *prefix)
 {
-	int cpu;
-	int nrcpus = 0;
-	struct evsel *counter;
-	u64 ena, run, val;
-	double uval;
-	struct aggr_cpu_id id;
+	int all_idx, cpu;
 
-	nrcpus = evlist->core.cpus->nr;
-	for (cpu = 0; cpu < nrcpus; cpu++) {
+	perf_cpu_map__for_each_cpu(cpu, all_idx, evlist->core.cpus) {
+		struct evsel *counter;
 		bool first = true;
 
 		if (prefix)
 			fputs(prefix, config->output);
 		evlist__for_each_entry(evlist, counter) {
-			id = aggr_cpu_id__empty();
-			id.core = cpu;
+			u64 ena, run, val;
+			double uval;
+			struct aggr_cpu_id id;
+			int counter_idx = perf_cpu_map__idx(evsel__cpus(counter), cpu);
+
+			if (counter_idx < 0)
+				continue;
+
+			id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
 			if (first) {
 				aggr_printout(config, counter, id, 0);
 				first = false;
 			}
-			val = perf_counts(counter->counts, cpu, 0)->val;
-			ena = perf_counts(counter->counts, cpu, 0)->ena;
-			run = perf_counts(counter->counts, cpu, 0)->run;
+			val = perf_counts(counter->counts, counter_idx, 0)->val;
+			ena = perf_counts(counter->counts, counter_idx, 0)->ena;
+			run = perf_counts(counter->counts, counter_idx, 0)->run;
 
 			uval = val * counter->scale;
 			printout(config, id, 0, counter, uval, prefix,
-- 
cgit v1.2.3


From 2ca0a3718da24953689b1771589ac63b60f17358 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:29 -0800
Subject: perf evsel: Derive CPUs and threads in alloc_counts

Passing the number of CPUs and threads allows for an evsel's counts to
be mismatched to its cpu map. To avoid this always derive the counts
size from the cpu map. Change openat-syscall-all-cpus to set the cpus
to allow for this to work.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-27-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/openat-syscall-all-cpus.c | 10 +---------
 tools/perf/util/counts.c                   |  8 ++++++--
 tools/perf/util/counts.h                   |  2 +-
 tools/perf/util/evsel.c                    |  2 +-
 tools/perf/util/stat.c                     | 13 ++++++-------
 5 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index cd3dd463783f..544db0839b3b 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -85,15 +85,7 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 		CPU_CLR(cpus->map[cpu], &cpu_set);
 	}
 
-	/*
-	 * Here we need to explicitly preallocate the counts, as if
-	 * we use the auto allocation it will allocate just for 1 cpu,
-	 * as we start by cpu 0.
-	 */
-	if (evsel__alloc_counts(evsel, cpus->nr, 1) < 0) {
-		pr_debug("evsel__alloc_counts(ncpus=%d)\n", cpus->nr);
-		goto out_close_fd;
-	}
+	evsel->core.cpus = perf_cpu_map__get(cpus);
 
 	err = 0;
 
diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c
index 582f3aeaf5e4..2b81707b9dba 100644
--- a/tools/perf/util/counts.c
+++ b/tools/perf/util/counts.c
@@ -4,6 +4,7 @@
 #include <string.h>
 #include "evsel.h"
 #include "counts.h"
+#include <perf/threadmap.h>
 #include <linux/zalloc.h>
 
 struct perf_counts *perf_counts__new(int ncpus, int nthreads)
@@ -55,9 +56,12 @@ void evsel__reset_counts(struct evsel *evsel)
 	perf_counts__reset(evsel->counts);
 }
 
-int evsel__alloc_counts(struct evsel *evsel, int ncpus, int nthreads)
+int evsel__alloc_counts(struct evsel *evsel)
 {
-	evsel->counts = perf_counts__new(ncpus, nthreads);
+	struct perf_cpu_map *cpus = evsel__cpus(evsel);
+	int nthreads = perf_thread_map__nr(evsel->core.threads);
+
+	evsel->counts = perf_counts__new(cpus ? cpus->nr : 1, nthreads);
 	return evsel->counts != NULL ? 0 : -ENOMEM;
 }
 
diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h
index 7ff36bf6d644..3e275e9c60d1 100644
--- a/tools/perf/util/counts.h
+++ b/tools/perf/util/counts.h
@@ -40,7 +40,7 @@ void perf_counts__delete(struct perf_counts *counts);
 void perf_counts__reset(struct perf_counts *counts);
 
 void evsel__reset_counts(struct evsel *evsel);
-int evsel__alloc_counts(struct evsel *evsel, int ncpus, int nthreads);
+int evsel__alloc_counts(struct evsel *evsel);
 void evsel__free_counts(struct evsel *evsel);
 
 #endif /* __PERF_COUNTS_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a0acf53a2510..2de569a1a272 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1578,7 +1578,7 @@ int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale)
 	if (FD(evsel, cpu, thread) < 0)
 		return -EINVAL;
 
-	if (evsel->counts == NULL && evsel__alloc_counts(evsel, cpu + 1, thread + 1) < 0)
+	if (evsel->counts == NULL && evsel__alloc_counts(evsel) < 0)
 		return -ENOMEM;
 
 	if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0)
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index c69b221f5e3e..995cb5003133 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -152,11 +152,13 @@ static void evsel__free_stat_priv(struct evsel *evsel)
 	zfree(&evsel->stats);
 }
 
-static int evsel__alloc_prev_raw_counts(struct evsel *evsel, int ncpus, int nthreads)
+static int evsel__alloc_prev_raw_counts(struct evsel *evsel)
 {
+	int cpu_map_nr = evsel__nr_cpus(evsel);
+	int nthreads = perf_thread_map__nr(evsel->core.threads);
 	struct perf_counts *counts;
 
-	counts = perf_counts__new(ncpus, nthreads);
+	counts = perf_counts__new(cpu_map_nr, nthreads);
 	if (counts)
 		evsel->prev_raw_counts = counts;
 
@@ -177,12 +179,9 @@ static void evsel__reset_prev_raw_counts(struct evsel *evsel)
 
 static int evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
 {
-	int ncpus = evsel__nr_cpus(evsel);
-	int nthreads = perf_thread_map__nr(evsel->core.threads);
-
 	if (evsel__alloc_stat_priv(evsel) < 0 ||
-	    evsel__alloc_counts(evsel, ncpus, nthreads) < 0 ||
-	    (alloc_raw && evsel__alloc_prev_raw_counts(evsel, ncpus, nthreads) < 0))
+	    evsel__alloc_counts(evsel) < 0 ||
+	    (alloc_raw && evsel__alloc_prev_raw_counts(evsel) < 0))
 		return -ENOMEM;
 
 	return 0;
-- 
cgit v1.2.3


From 7e3d1784c8a4d9c643a6ed0a2c44ee94dee8f7a6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:30 -0800
Subject: libperf: Switch cpu to more accurate cpu_map_idx

Modify variable names and adopt perf_cpu_map__for_each_cpu() in
perf_evsel__open().

Renaming is done by looking for consistency in API usage.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-28-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evsel.c              | 88 +++++++++++++++++++------------------
 tools/lib/perf/include/perf/evsel.h | 10 ++---
 2 files changed, 50 insertions(+), 48 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 68f83d2c27c1..8028b5a4da69 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -43,18 +43,22 @@ void perf_evsel__delete(struct perf_evsel *evsel)
 	free(evsel);
 }
 
-#define FD(e, x, y) ((int *) xyarray__entry(e->fd, x, y))
-#define MMAP(e, x, y) (e->mmap ? ((struct perf_mmap *) xyarray__entry(e->mmap, x, y)) : NULL)
+#define FD(_evsel, _cpu_map_idx, _thread)				\
+	((int *)xyarray__entry(_evsel->fd, _cpu_map_idx, _thread))
+#define MMAP(_evsel, _cpu_map_idx, _thread)				\
+	(_evsel->mmap ? ((struct perf_mmap *) xyarray__entry(_evsel->mmap, _cpu_map_idx, _thread)) \
+		      : NULL)
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
 	evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
 
 	if (evsel->fd) {
-		int cpu, thread;
-		for (cpu = 0; cpu < ncpus; cpu++) {
+		int idx, thread;
+
+		for (idx = 0; idx < ncpus; idx++) {
 			for (thread = 0; thread < nthreads; thread++) {
-				int *fd = FD(evsel, cpu, thread);
+				int *fd = FD(evsel, idx, thread);
 
 				if (fd)
 					*fd = -1;
@@ -80,7 +84,7 @@ sys_perf_event_open(struct perf_event_attr *attr,
 	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
 }
 
-static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread, int *group_fd)
+static int get_group_fd(struct perf_evsel *evsel, int cpu_map_idx, int thread, int *group_fd)
 {
 	struct perf_evsel *leader = evsel->leader;
 	int *fd;
@@ -97,7 +101,7 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread, int *grou
 	if (!leader->fd)
 		return -ENOTCONN;
 
-	fd = FD(leader, cpu, thread);
+	fd = FD(leader, cpu_map_idx, thread);
 	if (fd == NULL || *fd == -1)
 		return -EBADF;
 
@@ -109,7 +113,7 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread, int *grou
 int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 		     struct perf_thread_map *threads)
 {
-	int cpu, thread, err = 0;
+	int cpu, idx, thread, err = 0;
 
 	if (cpus == NULL) {
 		static struct perf_cpu_map *empty_cpu_map;
@@ -139,21 +143,21 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 	    perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
 		return -ENOMEM;
 
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		for (thread = 0; thread < threads->nr; thread++) {
 			int fd, group_fd, *evsel_fd;
 
-			evsel_fd = FD(evsel, cpu, thread);
+			evsel_fd = FD(evsel, idx, thread);
 			if (evsel_fd == NULL)
 				return -EINVAL;
 
-			err = get_group_fd(evsel, cpu, thread, &group_fd);
+			err = get_group_fd(evsel, idx, thread, &group_fd);
 			if (err < 0)
 				return err;
 
 			fd = sys_perf_event_open(&evsel->attr,
 						 threads->map[thread].pid,
-						 cpus->map[cpu], group_fd, 0);
+						 cpu, group_fd, 0);
 
 			if (fd < 0)
 				return -errno;
@@ -165,12 +169,12 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 	return err;
 }
 
-static void perf_evsel__close_fd_cpu(struct perf_evsel *evsel, int cpu)
+static void perf_evsel__close_fd_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 {
 	int thread;
 
 	for (thread = 0; thread < xyarray__max_y(evsel->fd); ++thread) {
-		int *fd = FD(evsel, cpu, thread);
+		int *fd = FD(evsel, cpu_map_idx, thread);
 
 		if (fd && *fd >= 0) {
 			close(*fd);
@@ -181,10 +185,8 @@ static void perf_evsel__close_fd_cpu(struct perf_evsel *evsel, int cpu)
 
 void perf_evsel__close_fd(struct perf_evsel *evsel)
 {
-	int cpu;
-
-	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++)
-		perf_evsel__close_fd_cpu(evsel, cpu);
+	for (int idx = 0; idx < xyarray__max_x(evsel->fd); idx++)
+		perf_evsel__close_fd_cpu(evsel, idx);
 }
 
 void perf_evsel__free_fd(struct perf_evsel *evsel)
@@ -202,29 +204,29 @@ void perf_evsel__close(struct perf_evsel *evsel)
 	perf_evsel__free_fd(evsel);
 }
 
-void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu)
+void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 {
 	if (evsel->fd == NULL)
 		return;
 
-	perf_evsel__close_fd_cpu(evsel, cpu);
+	perf_evsel__close_fd_cpu(evsel, cpu_map_idx);
 }
 
 void perf_evsel__munmap(struct perf_evsel *evsel)
 {
-	int cpu, thread;
+	int idx, thread;
 
 	if (evsel->fd == NULL || evsel->mmap == NULL)
 		return;
 
-	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+	for (idx = 0; idx < xyarray__max_x(evsel->fd); idx++) {
 		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
-			int *fd = FD(evsel, cpu, thread);
+			int *fd = FD(evsel, idx, thread);
 
 			if (fd == NULL || *fd < 0)
 				continue;
 
-			perf_mmap__munmap(MMAP(evsel, cpu, thread));
+			perf_mmap__munmap(MMAP(evsel, idx, thread));
 		}
 	}
 
@@ -234,7 +236,7 @@ void perf_evsel__munmap(struct perf_evsel *evsel)
 
 int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 {
-	int ret, cpu, thread;
+	int ret, idx, thread;
 	struct perf_mmap_param mp = {
 		.prot = PROT_READ | PROT_WRITE,
 		.mask = (pages * page_size) - 1,
@@ -246,18 +248,18 @@ int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 	if (perf_evsel__alloc_mmap(evsel, xyarray__max_x(evsel->fd), xyarray__max_y(evsel->fd)) < 0)
 		return -ENOMEM;
 
-	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+	for (idx = 0; idx < xyarray__max_x(evsel->fd); idx++) {
 		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
-			int *fd = FD(evsel, cpu, thread);
+			int *fd = FD(evsel, idx, thread);
 			struct perf_mmap *map;
 
 			if (fd == NULL || *fd < 0)
 				continue;
 
-			map = MMAP(evsel, cpu, thread);
+			map = MMAP(evsel, idx, thread);
 			perf_mmap__init(map, NULL, false, NULL);
 
-			ret = perf_mmap__mmap(map, &mp, *fd, cpu);
+			ret = perf_mmap__mmap(map, &mp, *fd, idx);
 			if (ret) {
 				perf_evsel__munmap(evsel);
 				return ret;
@@ -268,14 +270,14 @@ int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 	return 0;
 }
 
-void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread)
+void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu_map_idx, int thread)
 {
-	int *fd = FD(evsel, cpu, thread);
+	int *fd = FD(evsel, cpu_map_idx, thread);
 
-	if (fd == NULL || *fd < 0 || MMAP(evsel, cpu, thread) == NULL)
+	if (fd == NULL || *fd < 0 || MMAP(evsel, cpu_map_idx, thread) == NULL)
 		return NULL;
 
-	return MMAP(evsel, cpu, thread)->base;
+	return MMAP(evsel, cpu_map_idx, thread)->base;
 }
 
 int perf_evsel__read_size(struct perf_evsel *evsel)
@@ -303,19 +305,19 @@ int perf_evsel__read_size(struct perf_evsel *evsel)
 	return size;
 }
 
-int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
+int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread,
 		     struct perf_counts_values *count)
 {
 	size_t size = perf_evsel__read_size(evsel);
-	int *fd = FD(evsel, cpu, thread);
+	int *fd = FD(evsel, cpu_map_idx, thread);
 
 	memset(count, 0, sizeof(*count));
 
 	if (fd == NULL || *fd < 0)
 		return -EINVAL;
 
-	if (MMAP(evsel, cpu, thread) &&
-	    !perf_mmap__read_self(MMAP(evsel, cpu, thread), count))
+	if (MMAP(evsel, cpu_map_idx, thread) &&
+	    !perf_mmap__read_self(MMAP(evsel, cpu_map_idx, thread), count))
 		return 0;
 
 	if (readn(*fd, count->values, size) <= 0)
@@ -326,13 +328,13 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 
 static int perf_evsel__run_ioctl(struct perf_evsel *evsel,
 				 int ioc,  void *arg,
-				 int cpu)
+				 int cpu_map_idx)
 {
 	int thread;
 
 	for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
 		int err;
-		int *fd = FD(evsel, cpu, thread);
+		int *fd = FD(evsel, cpu_map_idx, thread);
 
 		if (fd == NULL || *fd < 0)
 			return -1;
@@ -346,9 +348,9 @@ static int perf_evsel__run_ioctl(struct perf_evsel *evsel,
 	return 0;
 }
 
-int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu)
+int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, cpu);
+	return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_ENABLE, NULL, cpu_map_idx);
 }
 
 int perf_evsel__enable(struct perf_evsel *evsel)
@@ -361,9 +363,9 @@ int perf_evsel__enable(struct perf_evsel *evsel)
 	return err;
 }
 
-int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu)
+int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_DISABLE, NULL, cpu);
+	return perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_DISABLE, NULL, cpu_map_idx);
 }
 
 int perf_evsel__disable(struct perf_evsel *evsel)
diff --git a/tools/lib/perf/include/perf/evsel.h b/tools/lib/perf/include/perf/evsel.h
index f401c7484bec..2a9516b42d15 100644
--- a/tools/lib/perf/include/perf/evsel.h
+++ b/tools/lib/perf/include/perf/evsel.h
@@ -28,16 +28,16 @@ LIBPERF_API void perf_evsel__delete(struct perf_evsel *evsel);
 LIBPERF_API int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 				 struct perf_thread_map *threads);
 LIBPERF_API void perf_evsel__close(struct perf_evsel *evsel);
-LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu_map_idx);
 LIBPERF_API int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
 LIBPERF_API void perf_evsel__munmap(struct perf_evsel *evsel);
-LIBPERF_API void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
-LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
+LIBPERF_API void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu_map_idx, int thread);
+LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread,
 				 struct perf_counts_values *count);
 LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
-LIBPERF_API int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu);
+LIBPERF_API int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
 LIBPERF_API int perf_evsel__disable(struct perf_evsel *evsel);
-LIBPERF_API int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu);
+LIBPERF_API int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
 LIBPERF_API struct perf_cpu_map *perf_evsel__cpus(struct perf_evsel *evsel);
 LIBPERF_API struct perf_thread_map *perf_evsel__threads(struct perf_evsel *evsel);
 LIBPERF_API struct perf_event_attr *perf_evsel__attr(struct perf_evsel *evsel);
-- 
cgit v1.2.3


From 47ffe806674f67e729627edd689b10827b1790eb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:31 -0800
Subject: libperf: Use cpu not index for evsel mmap

Fix issue where evsel's CPU map index was being used as the mmap cpu.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-29-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evsel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 8028b5a4da69..f1e1665ef4bd 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -252,6 +252,7 @@ int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
 			int *fd = FD(evsel, idx, thread);
 			struct perf_mmap *map;
+			int cpu = perf_cpu_map__cpu(evsel->cpus, idx);
 
 			if (fd == NULL || *fd < 0)
 				continue;
@@ -259,7 +260,7 @@ int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 			map = MMAP(evsel, idx, thread);
 			perf_mmap__init(map, NULL, false, NULL);
 
-			ret = perf_mmap__mmap(map, &mp, *fd, idx);
+			ret = perf_mmap__mmap(map, &mp, *fd, cpu);
 			if (ret) {
 				perf_evsel__munmap(evsel);
 				return ret;
-- 
cgit v1.2.3


From 7316268ff740c29dfb52649ff8074a5aa17ec0ce Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:32 -0800
Subject: perf counts: Switch name cpu to cpu_map_idx

Try to reduce confusion in particular when the cpu map doesn't contain
an entry for every CPU.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-30-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/counts.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h
index 3e275e9c60d1..5de275194f2b 100644
--- a/tools/perf/util/counts.h
+++ b/tools/perf/util/counts.h
@@ -18,21 +18,21 @@ struct perf_counts {
 
 
 static inline struct perf_counts_values*
-perf_counts(struct perf_counts *counts, int cpu, int thread)
+perf_counts(struct perf_counts *counts, int cpu_map_idx, int thread)
 {
-	return xyarray__entry(counts->values, cpu, thread);
+	return xyarray__entry(counts->values, cpu_map_idx, thread);
 }
 
 static inline bool
-perf_counts__is_loaded(struct perf_counts *counts, int cpu, int thread)
+perf_counts__is_loaded(struct perf_counts *counts, int cpu_map_idx, int thread)
 {
-	return *((bool *) xyarray__entry(counts->loaded, cpu, thread));
+	return *((bool *) xyarray__entry(counts->loaded, cpu_map_idx, thread));
 }
 
 static inline void
-perf_counts__set_loaded(struct perf_counts *counts, int cpu, int thread, bool loaded)
+perf_counts__set_loaded(struct perf_counts *counts, int cpu_map_idx, int thread, bool loaded)
 {
-	*((bool *) xyarray__entry(counts->loaded, cpu, thread)) = loaded;
+	*((bool *) xyarray__entry(counts->loaded, cpu_map_idx, thread)) = loaded;
 }
 
 struct perf_counts *perf_counts__new(int ncpus, int nthreads);
-- 
cgit v1.2.3


From ab90caa7b2d0b708cfee16b33325ca24de4d8f25 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:33 -0800
Subject: perf stat: Rename aggr_data cpu to imply it's an index

Trying to make cpu maps less error prone.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-31-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index f48d1678861c..7e933a8fee68 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -626,7 +626,7 @@ struct aggr_data {
 	u64 ena, run, val;
 	struct aggr_cpu_id id;
 	int nr;
-	int cpu;
+	int cpu_map_idx;
 };
 
 static void aggr_cb(struct perf_stat_config *config,
@@ -878,9 +878,9 @@ static void counter_cb(struct perf_stat_config *config __maybe_unused,
 {
 	struct aggr_data *ad = data;
 
-	ad->val += perf_counts(counter->counts, ad->cpu, 0)->val;
-	ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena;
-	ad->run += perf_counts(counter->counts, ad->cpu, 0)->run;
+	ad->val += perf_counts(counter->counts, ad->cpu_map_idx, 0)->val;
+	ad->ena += perf_counts(counter->counts, ad->cpu_map_idx, 0)->ena;
+	ad->run += perf_counts(counter->counts, ad->cpu_map_idx, 0)->run;
 }
 
 /*
@@ -897,7 +897,7 @@ static void print_counter(struct perf_stat_config *config,
 	struct aggr_cpu_id id;
 
 	for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
-		struct aggr_data ad = { .cpu = cpu };
+		struct aggr_data ad = { .cpu_map_idx = cpu };
 
 		if (!collect_data(config, counter, counter_cb, &ad))
 			return;
-- 
cgit v1.2.3


From 7ea82fbee4598e51e8bf47566b252cd5745d5b17 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:34 -0800
Subject: perf stat: Use perf_cpu_map__for_each_cpu()

Correct in print_counter() where an index was being used as a cpu.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-32-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 7e933a8fee68..0f192360b6c6 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -4,6 +4,7 @@
 #include <linux/string.h>
 #include <linux/time64.h>
 #include <math.h>
+#include <perf/cpumap.h>
 #include "color.h"
 #include "counts.h"
 #include "evlist.h"
@@ -732,7 +733,7 @@ static void print_aggr(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			print_counter_aggrdata(config, counter, s,
 					       prefix, metric_only,
-					       &first, -1);
+					       &first, /*cpu=*/-1);
 		}
 		if (metric_only)
 			fputc('\n', output);
@@ -893,11 +894,11 @@ static void print_counter(struct perf_stat_config *config,
 	FILE *output = config->output;
 	u64 ena, run, val;
 	double uval;
-	int cpu;
+	int idx, cpu;
 	struct aggr_cpu_id id;
 
-	for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
-		struct aggr_data ad = { .cpu_map_idx = cpu };
+	perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
+		struct aggr_data ad = { .cpu_map_idx = idx };
 
 		if (!collect_data(config, counter, counter_cb, &ad))
 			return;
@@ -1248,7 +1249,7 @@ static void print_percore(struct perf_stat_config *config,
 
 		print_counter_aggrdata(config, counter, s,
 				       prefix, metric_only,
-				       &first, -1);
+				       &first, /*cpu=*/-1);
 	}
 
 	if (metric_only)
-- 
cgit v1.2.3


From f9551b3f6249cfe8ea5b5f8716675ccf2f6ec737 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:35 -0800
Subject: perf script: Use for each cpu to aid readability

Use perf_cpu_map__for_each_cpu() to help with readability.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-33-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c9b3002ec254..f40319144856 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2115,8 +2115,7 @@ static struct scripting_ops	*scripting_ops;
 static void __process_stat(struct evsel *counter, u64 tstamp)
 {
 	int nthreads = perf_thread_map__nr(counter->core.threads);
-	int ncpus = evsel__nr_cpus(counter);
-	int cpu, thread;
+	int idx, cpu, thread;
 	static int header_printed;
 
 	if (counter->core.system_wide)
@@ -2129,13 +2128,13 @@ static void __process_stat(struct evsel *counter, u64 tstamp)
 	}
 
 	for (thread = 0; thread < nthreads; thread++) {
-		for (cpu = 0; cpu < ncpus; cpu++) {
+		perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
 			struct perf_counts_values *counts;
 
 			counts = perf_counts(counter->counts, cpu, thread);
 
 			printf("%3d %8d %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %s\n",
-				counter->core.cpus->map[cpu],
+				cpu,
 				perf_thread_map__pid(counter->core.threads, thread),
 				counts->val,
 				counts->ena,
-- 
cgit v1.2.3


From 80b82f3b65e94ba22d3f12a98f7ecc56cc14c903 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:36 -0800
Subject: libperf: Allow NULL in perf_cpu_map__idx()

Return -1, not found, if NULL is passed.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-34-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/cpumap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index 3c36a06771af..eacea3ab965a 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -270,8 +270,13 @@ bool perf_cpu_map__empty(const struct perf_cpu_map *map)
 
 int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu)
 {
-	int low = 0, high = cpus->nr;
+	int low, high;
 
+	if (!cpus)
+		return -1;
+
+	low = 0;
+	high = cpus->nr;
 	while (low < high) {
 		int idx = (low + high) / 2,
 		    cpu_at_idx = cpus->map[idx];
-- 
cgit v1.2.3


From 472832d2c000b9611feaea66fe521055c3dbf17a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:37 -0800
Subject: perf evlist: Refactor evlist__for_each_cpu()

Previously evlist__for_each_cpu() needed to iterate over the evlist in
an inner loop and call "skip" routines. Refactor this so that the
iteratr is smarter and the next function can update both the current CPU
and evsel.

By using a cpu map index, fix apparent off-by-1 in __run_perf_stat's
call to perf_evsel__close_cpu().

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-35-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 179 ++++++++++++++++++++++------------------------
 tools/perf/util/evlist.c  | 146 ++++++++++++++++++++-----------------
 tools/perf/util/evlist.h  |  50 +++++++++++--
 tools/perf/util/evsel.h   |   1 -
 4 files changed, 210 insertions(+), 166 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a518fcf0b3f8..f84116c9e016 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -405,36 +405,33 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu)
 
 static int read_affinity_counters(struct timespec *rs)
 {
-	struct evsel *counter;
-	struct affinity affinity;
-	int i, ncpus, cpu;
+	struct evlist_cpu_iterator evlist_cpu_itr;
+	struct affinity saved_affinity, *affinity;
 
 	if (all_counters_use_bpf)
 		return 0;
 
-	if (affinity__setup(&affinity) < 0)
+	if (!target__has_cpu(&target) || target__has_per_thread(&target))
+		affinity = NULL;
+	else if (affinity__setup(&saved_affinity) < 0)
 		return -1;
+	else
+		affinity = &saved_affinity;
 
-	ncpus = perf_cpu_map__nr(evsel_list->core.all_cpus);
-	if (!target__has_cpu(&target) || target__has_per_thread(&target))
-		ncpus = 1;
-	evlist__for_each_cpu(evsel_list, i, cpu) {
-		if (i >= ncpus)
-			break;
-		affinity__set(&affinity, cpu);
+	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+		struct evsel *counter = evlist_cpu_itr.evsel;
 
-		evlist__for_each_entry(evsel_list, counter) {
-			if (evsel__cpu_iter_skip(counter, cpu))
-				continue;
-			if (evsel__is_bpf(counter))
-				continue;
-			if (!counter->err) {
-				counter->err = read_counter_cpu(counter, rs,
-								counter->cpu_iter - 1);
-			}
+		if (evsel__is_bpf(counter))
+			continue;
+
+		if (!counter->err) {
+			counter->err = read_counter_cpu(counter, rs,
+							evlist_cpu_itr.cpu_map_idx);
 		}
 	}
-	affinity__cleanup(&affinity);
+	if (affinity)
+		affinity__cleanup(&saved_affinity);
+
 	return 0;
 }
 
@@ -788,8 +785,9 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	int status = 0;
 	const bool forks = (argc > 0);
 	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
+	struct evlist_cpu_iterator evlist_cpu_itr;
 	struct affinity affinity;
-	int i, cpu, err;
+	int err;
 	bool second_pass = false;
 
 	if (forks) {
@@ -813,102 +811,97 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			all_counters_use_bpf = false;
 	}
 
-	evlist__for_each_cpu (evsel_list, i, cpu) {
+	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+		counter = evlist_cpu_itr.evsel;
+
 		/*
 		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
 		 * no need to call it again here.
 		 */
 		if (target.use_bpf)
 			break;
-		affinity__set(&affinity, cpu);
 
-		evlist__for_each_entry(evsel_list, counter) {
-			if (evsel__cpu_iter_skip(counter, cpu))
+		if (counter->reset_group || counter->errored)
+			continue;
+		if (evsel__is_bpf(counter))
+			continue;
+try_again:
+		if (create_perf_stat_counter(counter, &stat_config, &target,
+					     evlist_cpu_itr.cpu_map_idx) < 0) {
+
+			/*
+			 * Weak group failed. We cannot just undo this here
+			 * because earlier CPUs might be in group mode, and the kernel
+			 * doesn't support mixing group and non group reads. Defer
+			 * it to later.
+			 * Don't close here because we're in the wrong affinity.
+			 */
+			if ((errno == EINVAL || errno == EBADF) &&
+				evsel__leader(counter) != counter &&
+				counter->weak_group) {
+				evlist__reset_weak_group(evsel_list, counter, false);
+				assert(counter->reset_group);
+				second_pass = true;
 				continue;
-			if (counter->reset_group || counter->errored)
+			}
+
+			switch (stat_handle_error(counter)) {
+			case COUNTER_FATAL:
+				return -1;
+			case COUNTER_RETRY:
+				goto try_again;
+			case COUNTER_SKIP:
 				continue;
-			if (evsel__is_bpf(counter))
+			default:
+				break;
+			}
+
+		}
+		counter->supported = true;
+	}
+
+	if (second_pass) {
+		/*
+		 * Now redo all the weak group after closing them,
+		 * and also close errored counters.
+		 */
+
+		/* First close errored or weak retry */
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+			counter = evlist_cpu_itr.evsel;
+
+			if (!counter->reset_group && !counter->errored)
 				continue;
-try_again:
+
+			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
+		}
+		/* Now reopen weak */
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+			counter = evlist_cpu_itr.evsel;
+
+			if (!counter->reset_group && !counter->errored)
+				continue;
+			if (!counter->reset_group)
+				continue;
+try_again_reset:
+			pr_debug2("reopening weak %s\n", evsel__name(counter));
 			if (create_perf_stat_counter(counter, &stat_config, &target,
-						     counter->cpu_iter - 1) < 0) {
-
-				/*
-				 * Weak group failed. We cannot just undo this here
-				 * because earlier CPUs might be in group mode, and the kernel
-				 * doesn't support mixing group and non group reads. Defer
-				 * it to later.
-				 * Don't close here because we're in the wrong affinity.
-				 */
-				if ((errno == EINVAL || errno == EBADF) &&
-				    evsel__leader(counter) != counter &&
-				    counter->weak_group) {
-					evlist__reset_weak_group(evsel_list, counter, false);
-					assert(counter->reset_group);
-					second_pass = true;
-					continue;
-				}
+						     evlist_cpu_itr.cpu_map_idx) < 0) {
 
 				switch (stat_handle_error(counter)) {
 				case COUNTER_FATAL:
 					return -1;
 				case COUNTER_RETRY:
-					goto try_again;
+					goto try_again_reset;
 				case COUNTER_SKIP:
 					continue;
 				default:
 					break;
 				}
-
 			}
 			counter->supported = true;
 		}
 	}
-
-	if (second_pass) {
-		/*
-		 * Now redo all the weak group after closing them,
-		 * and also close errored counters.
-		 */
-
-		evlist__for_each_cpu(evsel_list, i, cpu) {
-			affinity__set(&affinity, cpu);
-			/* First close errored or weak retry */
-			evlist__for_each_entry(evsel_list, counter) {
-				if (!counter->reset_group && !counter->errored)
-					continue;
-				if (evsel__cpu_iter_skip_no_inc(counter, cpu))
-					continue;
-				perf_evsel__close_cpu(&counter->core, counter->cpu_iter);
-			}
-			/* Now reopen weak */
-			evlist__for_each_entry(evsel_list, counter) {
-				if (!counter->reset_group && !counter->errored)
-					continue;
-				if (evsel__cpu_iter_skip(counter, cpu))
-					continue;
-				if (!counter->reset_group)
-					continue;
-try_again_reset:
-				pr_debug2("reopening weak %s\n", evsel__name(counter));
-				if (create_perf_stat_counter(counter, &stat_config, &target,
-							     counter->cpu_iter - 1) < 0) {
-
-					switch (stat_handle_error(counter)) {
-					case COUNTER_FATAL:
-						return -1;
-					case COUNTER_RETRY:
-						goto try_again_reset;
-					case COUNTER_SKIP:
-						continue;
-					default:
-						break;
-					}
-				}
-				counter->supported = true;
-			}
-		}
-	}
 	affinity__cleanup(&affinity);
 
 	evlist__for_each_entry(evsel_list, counter) {
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 5f92319ce258..39d294f6c321 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -342,36 +342,65 @@ static int evlist__nr_threads(struct evlist *evlist, struct evsel *evsel)
 		return perf_thread_map__nr(evlist->core.threads);
 }
 
-void evlist__cpu_iter_start(struct evlist *evlist)
-{
-	struct evsel *pos;
-
-	/*
-	 * Reset the per evsel cpu_iter. This is needed because
-	 * each evsel's cpumap may have a different index space,
-	 * and some operations need the index to modify
-	 * the FD xyarray (e.g. open, close)
-	 */
-	evlist__for_each_entry(evlist, pos)
-		pos->cpu_iter = 0;
-}
+struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
+{
+	struct evlist_cpu_iterator itr = {
+		.container = evlist,
+		.evsel = evlist__first(evlist),
+		.cpu_map_idx = 0,
+		.evlist_cpu_map_idx = 0,
+		.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
+		.cpu = -1,
+		.affinity = affinity,
+	};
 
-bool evsel__cpu_iter_skip_no_inc(struct evsel *ev, int cpu)
-{
-	if (ev->cpu_iter >= ev->core.cpus->nr)
-		return true;
-	if (cpu >= 0 && ev->core.cpus->map[ev->cpu_iter] != cpu)
-		return true;
-	return false;
+	if (itr.affinity) {
+		itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
+		affinity__set(itr.affinity, itr.cpu);
+		itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
+		/*
+		 * If this CPU isn't in the evsel's cpu map then advance through
+		 * the list.
+		 */
+		if (itr.cpu_map_idx == -1)
+			evlist_cpu_iterator__next(&itr);
+	}
+	return itr;
+}
+
+void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
+{
+	while (evlist_cpu_itr->evsel != evlist__last(evlist_cpu_itr->container)) {
+		evlist_cpu_itr->evsel = evsel__next(evlist_cpu_itr->evsel);
+		evlist_cpu_itr->cpu_map_idx =
+			perf_cpu_map__idx(evlist_cpu_itr->evsel->core.cpus,
+					  evlist_cpu_itr->cpu);
+		if (evlist_cpu_itr->cpu_map_idx != -1)
+			return;
+	}
+	evlist_cpu_itr->evlist_cpu_map_idx++;
+	if (evlist_cpu_itr->evlist_cpu_map_idx < evlist_cpu_itr->evlist_cpu_map_nr) {
+		evlist_cpu_itr->evsel = evlist__first(evlist_cpu_itr->container);
+		evlist_cpu_itr->cpu =
+			perf_cpu_map__cpu(evlist_cpu_itr->container->core.all_cpus,
+					  evlist_cpu_itr->evlist_cpu_map_idx);
+		if (evlist_cpu_itr->affinity)
+			affinity__set(evlist_cpu_itr->affinity, evlist_cpu_itr->cpu);
+		evlist_cpu_itr->cpu_map_idx =
+			perf_cpu_map__idx(evlist_cpu_itr->evsel->core.cpus,
+					  evlist_cpu_itr->cpu);
+		/*
+		 * If this CPU isn't in the evsel's cpu map then advance through
+		 * the list.
+		 */
+		if (evlist_cpu_itr->cpu_map_idx == -1)
+			evlist_cpu_iterator__next(evlist_cpu_itr);
+	}
 }
 
-bool evsel__cpu_iter_skip(struct evsel *ev, int cpu)
+bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
 {
-	if (!evsel__cpu_iter_skip_no_inc(ev, cpu)) {
-		ev->cpu_iter++;
-		return false;
-	}
-	return true;
+	return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
 }
 
 static int evsel__strcmp(struct evsel *pos, char *evsel_name)
@@ -400,31 +429,26 @@ static int evlist__is_enabled(struct evlist *evlist)
 static void __evlist__disable(struct evlist *evlist, char *evsel_name)
 {
 	struct evsel *pos;
+	struct evlist_cpu_iterator evlist_cpu_itr;
 	struct affinity affinity;
-	int cpu, i, imm = 0;
 	bool has_imm = false;
 
 	if (affinity__setup(&affinity) < 0)
 		return;
 
 	/* Disable 'immediate' events last */
-	for (imm = 0; imm <= 1; imm++) {
-		evlist__for_each_cpu(evlist, i, cpu) {
-			affinity__set(&affinity, cpu);
-
-			evlist__for_each_entry(evlist, pos) {
-				if (evsel__strcmp(pos, evsel_name))
-					continue;
-				if (evsel__cpu_iter_skip(pos, cpu))
-					continue;
-				if (pos->disabled || !evsel__is_group_leader(pos) || !pos->core.fd)
-					continue;
-				if (pos->immediate)
-					has_imm = true;
-				if (pos->immediate != imm)
-					continue;
-				evsel__disable_cpu(pos, pos->cpu_iter - 1);
-			}
+	for (int imm = 0; imm <= 1; imm++) {
+		evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+			pos = evlist_cpu_itr.evsel;
+			if (evsel__strcmp(pos, evsel_name))
+				continue;
+			if (pos->disabled || !evsel__is_group_leader(pos) || !pos->core.fd)
+				continue;
+			if (pos->immediate)
+				has_imm = true;
+			if (pos->immediate != imm)
+				continue;
+			evsel__disable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
 		}
 		if (!has_imm)
 			break;
@@ -462,24 +486,19 @@ void evlist__disable_evsel(struct evlist *evlist, char *evsel_name)
 static void __evlist__enable(struct evlist *evlist, char *evsel_name)
 {
 	struct evsel *pos;
+	struct evlist_cpu_iterator evlist_cpu_itr;
 	struct affinity affinity;
-	int cpu, i;
 
 	if (affinity__setup(&affinity) < 0)
 		return;
 
-	evlist__for_each_cpu(evlist, i, cpu) {
-		affinity__set(&affinity, cpu);
-
-		evlist__for_each_entry(evlist, pos) {
-			if (evsel__strcmp(pos, evsel_name))
-				continue;
-			if (evsel__cpu_iter_skip(pos, cpu))
-				continue;
-			if (!evsel__is_group_leader(pos) || !pos->core.fd)
-				continue;
-			evsel__enable_cpu(pos, pos->cpu_iter - 1);
-		}
+	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+		pos = evlist_cpu_itr.evsel;
+		if (evsel__strcmp(pos, evsel_name))
+			continue;
+		if (!evsel__is_group_leader(pos) || !pos->core.fd)
+			continue;
+		evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
 	}
 	affinity__cleanup(&affinity);
 	evlist__for_each_entry(evlist, pos) {
@@ -1264,8 +1283,8 @@ void evlist__set_selected(struct evlist *evlist, struct evsel *evsel)
 void evlist__close(struct evlist *evlist)
 {
 	struct evsel *evsel;
+	struct evlist_cpu_iterator evlist_cpu_itr;
 	struct affinity affinity;
-	int cpu, i;
 
 	/*
 	 * With perf record core.cpus is usually NULL.
@@ -1279,15 +1298,12 @@ void evlist__close(struct evlist *evlist)
 
 	if (affinity__setup(&affinity) < 0)
 		return;
-	evlist__for_each_cpu(evlist, i, cpu) {
-		affinity__set(&affinity, cpu);
 
-		evlist__for_each_entry_reverse(evlist, evsel) {
-			if (evsel__cpu_iter_skip(evsel, cpu))
-			    continue;
-			perf_evsel__close_cpu(&evsel->core, evsel->cpu_iter - 1);
-		}
+	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+		perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
+				      evlist_cpu_itr.cpu_map_idx);
 	}
+
 	affinity__cleanup(&affinity);
 	evlist__for_each_entry_reverse(evlist, evsel) {
 		perf_evsel__free_fd(&evsel->core);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 27594900a052..57828ebfcb61 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -327,17 +327,53 @@ void evlist__to_front(struct evlist *evlist, struct evsel *move_evsel);
 #define evlist__for_each_entry_safe(evlist, tmp, evsel) \
 	__evlist__for_each_entry_safe(&(evlist)->core.entries, tmp, evsel)
 
-#define evlist__for_each_cpu(evlist, index, cpu)	\
-	evlist__cpu_iter_start(evlist);			\
-	perf_cpu_map__for_each_cpu (cpu, index, (evlist)->core.all_cpus)
+/** Iterator state for evlist__for_each_cpu */
+struct evlist_cpu_iterator {
+	/** The list being iterated through. */
+	struct evlist *container;
+	/** The current evsel of the iterator. */
+	struct evsel *evsel;
+	/** The CPU map index corresponding to the evsel->core.cpus for the current CPU. */
+	int cpu_map_idx;
+	/**
+	 * The CPU map index corresponding to evlist->core.all_cpus for the
+	 * current CPU.  Distinct from cpu_map_idx as the evsel's cpu map may
+	 * contain fewer entries.
+	 */
+	int evlist_cpu_map_idx;
+	/** The number of CPU map entries in evlist->core.all_cpus. */
+	int evlist_cpu_map_nr;
+	/** The current CPU of the iterator. */
+	int cpu;
+	/** If present, used to set the affinity when switching between CPUs. */
+	struct affinity *affinity;
+};
+
+/**
+ * evlist__for_each_cpu - without affinity, iterate over the evlist. With
+ *                        affinity, iterate over all CPUs and then the evlist
+ *                        for each evsel on that CPU. When switching between
+ *                        CPUs the affinity is set to the CPU to avoid IPIs
+ *                        during syscalls.
+ * @evlist_cpu_itr: the iterator instance.
+ * @evlist: evlist instance to iterate.
+ * @affinity: NULL or used to set the affinity to the current CPU.
+ */
+#define evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity)		\
+	for ((evlist_cpu_itr) = evlist__cpu_begin(evlist, affinity);	\
+	     !evlist_cpu_iterator__end(&evlist_cpu_itr);		\
+	     evlist_cpu_iterator__next(&evlist_cpu_itr))
+
+/** Returns an iterator set to the first CPU/evsel of evlist. */
+struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity);
+/** Move to next element in iterator, updating CPU, evsel and the affinity. */
+void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr);
+/** Returns true when iterator is at the end of the CPUs and evlist. */
+bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr);
 
 struct evsel *evlist__get_tracking_event(struct evlist *evlist);
 void evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel);
 
-void evlist__cpu_iter_start(struct evlist *evlist);
-bool evsel__cpu_iter_skip(struct evsel *ev, int cpu);
-bool evsel__cpu_iter_skip_no_inc(struct evsel *ev, int cpu);
-
 struct evsel *evlist__find_evsel_by_str(struct evlist *evlist, const char *str);
 
 struct evsel *evlist__event2evsel(struct evlist *evlist, union perf_event *event);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 99aa3363def7..7cb7c9c77ab0 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -121,7 +121,6 @@ struct evsel {
 	bool			errored;
 	struct hashmap		*per_pkg_mask;
 	int			err;
-	int			cpu_iter;
 	struct {
 		evsel__sb_cb_t	*cb;
 		void		*data;
-- 
cgit v1.2.3


From 7ac0089d138f80dcd7ba8ca368a9b2bdfe780b16 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:38 -0800
Subject: perf evsel: Pass cpu not cpu map index to synthesize

evsel__write_stat_event() was incorrectly passing a cpu map index rather
than a CPU to perf_event__synthesize_stat().

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-36-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f84116c9e016..ed993c20772f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -327,10 +327,11 @@ static int write_stat_round_event(u64 tm, u64 type)
 
 #define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y)
 
-static int evsel__write_stat_event(struct evsel *counter, u32 cpu, u32 thread,
+static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 thread,
 				   struct perf_counts_values *count)
 {
-	struct perf_sample_id *sid = SID(counter, cpu, thread);
+	struct perf_sample_id *sid = SID(counter, cpu_map_idx, thread);
+	int cpu = perf_cpu_map__cpu(evsel__cpus(counter), cpu_map_idx);
 
 	return perf_event__synthesize_stat(NULL, cpu, thread, sid->id, count,
 					   process_synthesized_event, NULL);
-- 
cgit v1.2.3


From da8c94c065174099853a207d9716a49d339b265f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:39 -0800
Subject: perf stat: Correct variable name for read counter

Switch from cpu to cpu_map_idx to reduce confusion.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-37-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 24 ++++++++++++------------
 tools/perf/util/evsel.c   | 30 +++++++++++++++---------------
 tools/perf/util/evsel.h   | 16 ++++++++--------
 3 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ed993c20772f..dfb8f7847e6c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -337,25 +337,25 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
 					   process_synthesized_event, NULL);
 }
 
-static int read_single_counter(struct evsel *counter, int cpu,
+static int read_single_counter(struct evsel *counter, int cpu_map_idx,
 			       int thread, struct timespec *rs)
 {
 	if (counter->tool_event == PERF_TOOL_DURATION_TIME) {
 		u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
 		struct perf_counts_values *count =
-			perf_counts(counter->counts, cpu, thread);
+			perf_counts(counter->counts, cpu_map_idx, thread);
 		count->ena = count->run = val;
 		count->val = val;
 		return 0;
 	}
-	return evsel__read_counter(counter, cpu, thread);
+	return evsel__read_counter(counter, cpu_map_idx, thread);
 }
 
 /*
  * Read out the results of a single counter:
  * do not aggregate counts across CPUs in system-wide mode
  */
-static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu)
+static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
 {
 	int nthreads = perf_thread_map__nr(evsel_list->core.threads);
 	int thread;
@@ -369,24 +369,24 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu)
 	for (thread = 0; thread < nthreads; thread++) {
 		struct perf_counts_values *count;
 
-		count = perf_counts(counter->counts, cpu, thread);
+		count = perf_counts(counter->counts, cpu_map_idx, thread);
 
 		/*
 		 * The leader's group read loads data into its group members
 		 * (via evsel__read_counter()) and sets their count->loaded.
 		 */
-		if (!perf_counts__is_loaded(counter->counts, cpu, thread) &&
-		    read_single_counter(counter, cpu, thread, rs)) {
+		if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
+		    read_single_counter(counter, cpu_map_idx, thread, rs)) {
 			counter->counts->scaled = -1;
-			perf_counts(counter->counts, cpu, thread)->ena = 0;
-			perf_counts(counter->counts, cpu, thread)->run = 0;
+			perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
+			perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
 			return -1;
 		}
 
-		perf_counts__set_loaded(counter->counts, cpu, thread, false);
+		perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, false);
 
 		if (STAT_RECORD) {
-			if (evsel__write_stat_event(counter, cpu, thread, count)) {
+			if (evsel__write_stat_event(counter, cpu_map_idx, thread, count)) {
 				pr_err("failed to write stat event\n");
 				return -1;
 			}
@@ -396,7 +396,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu)
 			fprintf(stat_config.output,
 				"%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
 					evsel__name(counter),
-					cpu,
+					perf_cpu_map__cpu(evsel__cpus(counter), cpu_map_idx),
 					count->val, count->ena, count->run);
 		}
 	}
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2de569a1a272..8f539a81b30b 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1476,11 +1476,11 @@ void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
 	count->run = count->run - tmp.run;
 }
 
-static int evsel__read_one(struct evsel *evsel, int cpu, int thread)
+static int evsel__read_one(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	struct perf_counts_values *count = perf_counts(evsel->counts, cpu, thread);
+	struct perf_counts_values *count = perf_counts(evsel->counts, cpu_map_idx, thread);
 
-	return perf_evsel__read(&evsel->core, cpu, thread, count);
+	return perf_evsel__read(&evsel->core, cpu_map_idx, thread, count);
 }
 
 static void evsel__set_count(struct evsel *counter, int cpu, int thread, u64 val, u64 ena, u64 run)
@@ -1530,7 +1530,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu, int thread,
 	return 0;
 }
 
-static int evsel__read_group(struct evsel *leader, int cpu, int thread)
+static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
 {
 	struct perf_stat_evsel *ps = leader->stats;
 	u64 read_format = leader->core.attr.read_format;
@@ -1551,42 +1551,42 @@ static int evsel__read_group(struct evsel *leader, int cpu, int thread)
 		ps->group_data = data;
 	}
 
-	if (FD(leader, cpu, thread) < 0)
+	if (FD(leader, cpu_map_idx, thread) < 0)
 		return -EINVAL;
 
-	if (readn(FD(leader, cpu, thread), data, size) <= 0)
+	if (readn(FD(leader, cpu_map_idx, thread), data, size) <= 0)
 		return -errno;
 
-	return evsel__process_group_data(leader, cpu, thread, data);
+	return evsel__process_group_data(leader, cpu_map_idx, thread, data);
 }
 
-int evsel__read_counter(struct evsel *evsel, int cpu, int thread)
+int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
 {
 	u64 read_format = evsel->core.attr.read_format;
 
 	if (read_format & PERF_FORMAT_GROUP)
-		return evsel__read_group(evsel, cpu, thread);
+		return evsel__read_group(evsel, cpu_map_idx, thread);
 
-	return evsel__read_one(evsel, cpu, thread);
+	return evsel__read_one(evsel, cpu_map_idx, thread);
 }
 
-int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale)
+int __evsel__read_on_cpu(struct evsel *evsel, int cpu_map_idx, int thread, bool scale)
 {
 	struct perf_counts_values count;
 	size_t nv = scale ? 3 : 1;
 
-	if (FD(evsel, cpu, thread) < 0)
+	if (FD(evsel, cpu_map_idx, thread) < 0)
 		return -EINVAL;
 
 	if (evsel->counts == NULL && evsel__alloc_counts(evsel) < 0)
 		return -ENOMEM;
 
-	if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0)
+	if (readn(FD(evsel, cpu_map_idx, thread), &count, nv * sizeof(u64)) <= 0)
 		return -errno;
 
-	evsel__compute_deltas(evsel, cpu, thread, &count);
+	evsel__compute_deltas(evsel, cpu_map_idx, thread, &count);
 	perf_counts_values__scale(&count, scale, NULL);
-	*perf_counts(evsel->counts, cpu, thread) = count;
+	*perf_counts(evsel->counts, cpu_map_idx, thread) = count;
 	return 0;
 }
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 7cb7c9c77ab0..c3db41282400 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -333,32 +333,32 @@ static inline bool evsel__match2(struct evsel *e1, struct evsel *e2)
 	       (e1->core.attr.config == e2->core.attr.config);
 }
 
-int evsel__read_counter(struct evsel *evsel, int cpu, int thread);
+int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread);
 
-int __evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread, bool scale);
+int __evsel__read_on_cpu(struct evsel *evsel, int cpu_map_idx, int thread, bool scale);
 
 /**
  * evsel__read_on_cpu - Read out the results on a CPU and thread
  *
  * @evsel - event selector to read value
- * @cpu - CPU of interest
+ * @cpu_map_idx - CPU of interest
  * @thread - thread of interest
  */
-static inline int evsel__read_on_cpu(struct evsel *evsel, int cpu, int thread)
+static inline int evsel__read_on_cpu(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	return __evsel__read_on_cpu(evsel, cpu, thread, false);
+	return __evsel__read_on_cpu(evsel, cpu_map_idx, thread, false);
 }
 
 /**
  * evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled
  *
  * @evsel - event selector to read value
- * @cpu - CPU of interest
+ * @cpu_map_idx - CPU of interest
  * @thread - thread of interest
  */
-static inline int evsel__read_on_cpu_scaled(struct evsel *evsel, int cpu, int thread)
+static inline int evsel__read_on_cpu_scaled(struct evsel *evsel, int cpu_map_idx, int thread)
 {
-	return __evsel__read_on_cpu(evsel, cpu, thread, true);
+	return __evsel__read_on_cpu(evsel, cpu_map_idx, thread, true);
 }
 
 int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
-- 
cgit v1.2.3


From 2daa08c4d9cd9d0845094d718920e5d105c11558 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:40 -0800
Subject: perf evsel: Rename CPU around get_group_fd

CPU is really a cpu map index, change names to make code more intention
revealing.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-38-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8f539a81b30b..45338be3f501 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1591,27 +1591,27 @@ int __evsel__read_on_cpu(struct evsel *evsel, int cpu_map_idx, int thread, bool
 }
 
 static int evsel__match_other_cpu(struct evsel *evsel, struct evsel *other,
-				  int cpu)
+				  int cpu_map_idx)
 {
-	int cpuid;
+	int cpu;
 
-	cpuid = perf_cpu_map__cpu(evsel->core.cpus, cpu);
-	return perf_cpu_map__idx(other->core.cpus, cpuid);
+	cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+	return perf_cpu_map__idx(other->core.cpus, cpu);
 }
 
-static int evsel__hybrid_group_cpu(struct evsel *evsel, int cpu)
+static int evsel__hybrid_group_cpu_map_idx(struct evsel *evsel, int cpu_map_idx)
 {
 	struct evsel *leader = evsel__leader(evsel);
 
 	if ((evsel__is_hybrid(evsel) && !evsel__is_hybrid(leader)) ||
 	    (!evsel__is_hybrid(evsel) && evsel__is_hybrid(leader))) {
-		return evsel__match_other_cpu(evsel, leader, cpu);
+		return evsel__match_other_cpu(evsel, leader, cpu_map_idx);
 	}
 
-	return cpu;
+	return cpu_map_idx;
 }
 
-static int get_group_fd(struct evsel *evsel, int cpu, int thread)
+static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
 {
 	struct evsel *leader = evsel__leader(evsel);
 	int fd;
@@ -1625,11 +1625,11 @@ static int get_group_fd(struct evsel *evsel, int cpu, int thread)
 	 */
 	BUG_ON(!leader->core.fd);
 
-	cpu = evsel__hybrid_group_cpu(evsel, cpu);
-	if (cpu == -1)
+	cpu_map_idx = evsel__hybrid_group_cpu_map_idx(evsel, cpu_map_idx);
+	if (cpu_map_idx == -1)
 		return -1;
 
-	fd = FD(leader, cpu, thread);
+	fd = FD(leader, cpu_map_idx, thread);
 	BUG_ON(fd == -1);
 
 	return fd;
-- 
cgit v1.2.3


From 1fa497d4c01d497e25131ccdd5def6f24dd1f330 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:41 -0800
Subject: perf evsel: Reduce scope of evsel__ignore_missing_thread

Move to being static.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-39-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 8 ++++----
 tools/perf/util/evsel.h | 4 ----
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 45338be3f501..97348d302156 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1666,10 +1666,10 @@ static int update_fds(struct evsel *evsel,
 	return 0;
 }
 
-bool evsel__ignore_missing_thread(struct evsel *evsel,
-				  int nr_cpus, int cpu,
-				  struct perf_thread_map *threads,
-				  int thread, int err)
+static bool evsel__ignore_missing_thread(struct evsel *evsel,
+					 int nr_cpus, int cpu,
+					 struct perf_thread_map *threads,
+					 int thread, int err)
 {
 	pid_t ignore_pid = perf_thread_map__pid(threads, thread);
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index c3db41282400..84e597f6c395 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -301,10 +301,6 @@ bool evsel__detect_missing_features(struct evsel *evsel);
 enum rlimit_action { NO_CHANGE, SET_TO_MAX, INCREASED_MAX };
 bool evsel__increase_rlimit(enum rlimit_action *set_rlimit);
 
-bool evsel__ignore_missing_thread(struct evsel *evsel,
-				  int nr_cpus, int cpu,
-				  struct perf_thread_map *threads,
-				  int thread, int err);
 bool evsel__precise_ip_fallback(struct evsel *evsel);
 
 struct perf_sample;
-- 
cgit v1.2.3


From 6f844b1fdd3bc3a25995ff83edea32a73bfa72d9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:42 -0800
Subject: perf evsel: Rename variable cpu to index

Make naming less error prone.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-40-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 83 +++++++++++++++++++++++++------------------------
 tools/perf/util/evsel.h |  6 ++--
 tools/perf/util/stat.c  |  4 +--
 tools/perf/util/stat.h  |  2 +-
 4 files changed, 48 insertions(+), 47 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 97348d302156..796923c80ff6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1372,9 +1372,9 @@ int evsel__append_addr_filter(struct evsel *evsel, const char *filter)
 }
 
 /* Caller has to clear disabled after going through all CPUs. */
-int evsel__enable_cpu(struct evsel *evsel, int cpu)
+int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__enable_cpu(&evsel->core, cpu);
+	return perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);
 }
 
 int evsel__enable(struct evsel *evsel)
@@ -1387,9 +1387,9 @@ int evsel__enable(struct evsel *evsel)
 }
 
 /* Caller has to set disabled after going through all CPUs. */
-int evsel__disable_cpu(struct evsel *evsel, int cpu)
+int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
 {
-	return perf_evsel__disable_cpu(&evsel->core, cpu);
+	return perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);
 }
 
 int evsel__disable(struct evsel *evsel)
@@ -1455,7 +1455,7 @@ void evsel__delete(struct evsel *evsel)
 	free(evsel);
 }
 
-void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
+void evsel__compute_deltas(struct evsel *evsel, int cpu_map_idx, int thread,
 			   struct perf_counts_values *count)
 {
 	struct perf_counts_values tmp;
@@ -1463,12 +1463,12 @@ void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
 	if (!evsel->prev_raw_counts)
 		return;
 
-	if (cpu == -1) {
+	if (cpu_map_idx == -1) {
 		tmp = evsel->prev_raw_counts->aggr;
 		evsel->prev_raw_counts->aggr = *count;
 	} else {
-		tmp = *perf_counts(evsel->prev_raw_counts, cpu, thread);
-		*perf_counts(evsel->prev_raw_counts, cpu, thread) = *count;
+		tmp = *perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
+		*perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread) = *count;
 	}
 
 	count->val = count->val - tmp.val;
@@ -1483,20 +1483,21 @@ static int evsel__read_one(struct evsel *evsel, int cpu_map_idx, int thread)
 	return perf_evsel__read(&evsel->core, cpu_map_idx, thread, count);
 }
 
-static void evsel__set_count(struct evsel *counter, int cpu, int thread, u64 val, u64 ena, u64 run)
+static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread,
+			     u64 val, u64 ena, u64 run)
 {
 	struct perf_counts_values *count;
 
-	count = perf_counts(counter->counts, cpu, thread);
+	count = perf_counts(counter->counts, cpu_map_idx, thread);
 
 	count->val    = val;
 	count->ena    = ena;
 	count->run    = run;
 
-	perf_counts__set_loaded(counter->counts, cpu, thread, true);
+	perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true);
 }
 
-static int evsel__process_group_data(struct evsel *leader, int cpu, int thread, u64 *data)
+static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int thread, u64 *data)
 {
 	u64 read_format = leader->core.attr.read_format;
 	struct sample_read_value *v;
@@ -1515,7 +1516,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu, int thread,
 
 	v = (struct sample_read_value *) data;
 
-	evsel__set_count(leader, cpu, thread, v[0].value, ena, run);
+	evsel__set_count(leader, cpu_map_idx, thread, v[0].value, ena, run);
 
 	for (i = 1; i < nr; i++) {
 		struct evsel *counter;
@@ -1524,7 +1525,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu, int thread,
 		if (!counter)
 			return -EINVAL;
 
-		evsel__set_count(counter, cpu, thread, v[i].value, ena, run);
+		evsel__set_count(counter, cpu_map_idx, thread, v[i].value, ena, run);
 	}
 
 	return 0;
@@ -1643,16 +1644,16 @@ static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int
 }
 
 static int update_fds(struct evsel *evsel,
-		      int nr_cpus, int cpu_idx,
+		      int nr_cpus, int cpu_map_idx,
 		      int nr_threads, int thread_idx)
 {
 	struct evsel *pos;
 
-	if (cpu_idx >= nr_cpus || thread_idx >= nr_threads)
+	if (cpu_map_idx >= nr_cpus || thread_idx >= nr_threads)
 		return -EINVAL;
 
 	evlist__for_each_entry(evsel->evlist, pos) {
-		nr_cpus = pos != evsel ? nr_cpus : cpu_idx;
+		nr_cpus = pos != evsel ? nr_cpus : cpu_map_idx;
 
 		evsel__remove_fd(pos, nr_cpus, nr_threads, thread_idx);
 
@@ -1667,7 +1668,7 @@ static int update_fds(struct evsel *evsel,
 }
 
 static bool evsel__ignore_missing_thread(struct evsel *evsel,
-					 int nr_cpus, int cpu,
+					 int nr_cpus, int cpu_map_idx,
 					 struct perf_thread_map *threads,
 					 int thread, int err)
 {
@@ -1692,7 +1693,7 @@ static bool evsel__ignore_missing_thread(struct evsel *evsel,
 	 * We should remove fd for missing_thread first
 	 * because thread_map__remove() will decrease threads->nr.
 	 */
-	if (update_fds(evsel, nr_cpus, cpu, threads->nr, thread))
+	if (update_fds(evsel, nr_cpus, cpu_map_idx, threads->nr, thread))
 		return false;
 
 	if (thread_map__remove(threads, thread))
@@ -1974,9 +1975,9 @@ bool evsel__increase_rlimit(enum rlimit_action *set_rlimit)
 
 static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
 		struct perf_thread_map *threads,
-		int start_cpu, int end_cpu)
+		int start_cpu_map_idx, int end_cpu_map_idx)
 {
-	int cpu, thread, nthreads;
+	int idx, thread, nthreads;
 	int pid = -1, err, old_errno;
 	enum rlimit_action set_rlimit = NO_CHANGE;
 
@@ -2003,7 +2004,7 @@ fallback_missing_features:
 
 	display_attr(&evsel->core.attr);
 
-	for (cpu = start_cpu; cpu < end_cpu; cpu++) {
+	for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
 
 		for (thread = 0; thread < nthreads; thread++) {
 			int fd, group_fd;
@@ -2014,17 +2015,17 @@ retry_open:
 			if (!evsel->cgrp && !evsel->core.system_wide)
 				pid = perf_thread_map__pid(threads, thread);
 
-			group_fd = get_group_fd(evsel, cpu, thread);
+			group_fd = get_group_fd(evsel, idx, thread);
 
 			test_attr__ready();
 
 			pr_debug2_peo("sys_perf_event_open: pid %d  cpu %d  group_fd %d  flags %#lx",
-				pid, cpus->map[cpu], group_fd, evsel->open_flags);
+				pid, cpus->map[idx], group_fd, evsel->open_flags);
 
-			fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[cpu],
+			fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx],
 						group_fd, evsel->open_flags);
 
-			FD(evsel, cpu, thread) = fd;
+			FD(evsel, idx, thread) = fd;
 
 			if (fd < 0) {
 				err = -errno;
@@ -2034,10 +2035,10 @@ retry_open:
 				goto try_fallback;
 			}
 
-			bpf_counter__install_pe(evsel, cpu, fd);
+			bpf_counter__install_pe(evsel, idx, fd);
 
 			if (unlikely(test_attr__enabled)) {
-				test_attr__open(&evsel->core.attr, pid, cpus->map[cpu],
+				test_attr__open(&evsel->core.attr, pid, cpus->map[idx],
 						fd, group_fd, evsel->open_flags);
 			}
 
@@ -2078,7 +2079,7 @@ try_fallback:
 	if (evsel__precise_ip_fallback(evsel))
 		goto retry_open;
 
-	if (evsel__ignore_missing_thread(evsel, cpus->nr, cpu, threads, thread, err)) {
+	if (evsel__ignore_missing_thread(evsel, cpus->nr, idx, threads, thread, err)) {
 		/* We just removed 1 thread, so lower the upper nthreads limit. */
 		nthreads--;
 
@@ -2093,7 +2094,7 @@ try_fallback:
 	if (err == -EMFILE && evsel__increase_rlimit(&set_rlimit))
 		goto retry_open;
 
-	if (err != -EINVAL || cpu > 0 || thread > 0)
+	if (err != -EINVAL || idx > 0 || thread > 0)
 		goto out_close;
 
 	if (evsel__detect_missing_features(evsel))
@@ -2105,12 +2106,12 @@ out_close:
 	old_errno = errno;
 	do {
 		while (--thread >= 0) {
-			if (FD(evsel, cpu, thread) >= 0)
-				close(FD(evsel, cpu, thread));
-			FD(evsel, cpu, thread) = -1;
+			if (FD(evsel, idx, thread) >= 0)
+				close(FD(evsel, idx, thread));
+			FD(evsel, idx, thread) = -1;
 		}
 		thread = nthreads;
-	} while (--cpu >= 0);
+	} while (--idx >= 0);
 	errno = old_errno;
 	return err;
 }
@@ -2127,13 +2128,13 @@ void evsel__close(struct evsel *evsel)
 	perf_evsel__free_id(&evsel->core);
 }
 
-int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu)
+int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu_map_idx)
 {
-	if (cpu == -1)
+	if (cpu_map_idx == -1)
 		return evsel__open_cpu(evsel, cpus, NULL, 0,
 					cpus ? cpus->nr : 1);
 
-	return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1);
+	return evsel__open_cpu(evsel, cpus, NULL, cpu_map_idx, cpu_map_idx + 1);
 }
 
 int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads)
@@ -2958,15 +2959,15 @@ struct perf_env *evsel__env(struct evsel *evsel)
 
 static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist)
 {
-	int cpu, thread;
+	int cpu_map_idx, thread;
 
-	for (cpu = 0; cpu < xyarray__max_x(evsel->core.fd); cpu++) {
+	for (cpu_map_idx = 0; cpu_map_idx < xyarray__max_x(evsel->core.fd); cpu_map_idx++) {
 		for (thread = 0; thread < xyarray__max_y(evsel->core.fd);
 		     thread++) {
-			int fd = FD(evsel, cpu, thread);
+			int fd = FD(evsel, cpu_map_idx, thread);
 
 			if (perf_evlist__id_add_fd(&evlist->core, &evsel->core,
-						   cpu, thread, fd) < 0)
+						   cpu_map_idx, thread, fd) < 0)
 				return -1;
 		}
 	}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 84e597f6c395..5720ceebffac 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -284,12 +284,12 @@ void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr);
 int evsel__set_filter(struct evsel *evsel, const char *filter);
 int evsel__append_tp_filter(struct evsel *evsel, const char *filter);
 int evsel__append_addr_filter(struct evsel *evsel, const char *filter);
-int evsel__enable_cpu(struct evsel *evsel, int cpu);
+int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx);
 int evsel__enable(struct evsel *evsel);
 int evsel__disable(struct evsel *evsel);
-int evsel__disable_cpu(struct evsel *evsel, int cpu);
+int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx);
 
-int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu);
+int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu_map_idx);
 int evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads);
 int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
 		struct perf_thread_map *threads);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 995cb5003133..f7f9757eba23 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -531,7 +531,7 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp)
 int create_perf_stat_counter(struct evsel *evsel,
 			     struct perf_stat_config *config,
 			     struct target *target,
-			     int cpu)
+			     int cpu_map_idx)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
 	struct evsel *leader = evsel__leader(evsel);
@@ -585,7 +585,7 @@ int create_perf_stat_counter(struct evsel *evsel,
 	}
 
 	if (target__has_cpu(target) && !target__has_per_thread(target))
-		return evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu);
+		return evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu_map_idx);
 
 	return evsel__open_per_thread(evsel, evsel->core.threads);
 }
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 32cf24186229..5e25d53e891b 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -248,7 +248,7 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp);
 int create_perf_stat_counter(struct evsel *evsel,
 			     struct perf_stat_config *config,
 			     struct target *target,
-			     int cpu);
+			     int cpu_map_idx);
 void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *config,
 			    struct target *_target, struct timespec *ts, int argc, const char **argv);
 
-- 
cgit v1.2.3


From aa11e55a39950c0151e12abd30c7223dfc6f6a2c Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:43 -0800
Subject: perf test: Use perf_cpu_map__for_each_cpu()

Clean up variable naming to make cpu and index clearer.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-41-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/openat-syscall-all-cpus.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index 544db0839b3b..ca0a50e92839 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -22,7 +22,7 @@
 static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __maybe_unused,
 						  int subtest __maybe_unused)
 {
-	int err = -1, fd, cpu;
+	int err = -1, fd, idx, cpu;
 	struct perf_cpu_map *cpus;
 	struct evsel *evsel;
 	unsigned int nr_openat_calls = 111, i;
@@ -58,23 +58,23 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 		goto out_evsel_delete;
 	}
 
-	for (cpu = 0; cpu < cpus->nr; ++cpu) {
-		unsigned int ncalls = nr_openat_calls + cpu;
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+		unsigned int ncalls = nr_openat_calls + idx;
 		/*
 		 * XXX eventually lift this restriction in a way that
 		 * keeps perf building on older glibc installations
 		 * without CPU_ALLOC. 1024 cpus in 2010 still seems
 		 * a reasonable upper limit tho :-)
 		 */
-		if (cpus->map[cpu] >= CPU_SETSIZE) {
-			pr_debug("Ignoring CPU %d\n", cpus->map[cpu]);
+		if (cpu >= CPU_SETSIZE) {
+			pr_debug("Ignoring CPU %d\n", cpu);
 			continue;
 		}
 
-		CPU_SET(cpus->map[cpu], &cpu_set);
+		CPU_SET(cpu, &cpu_set);
 		if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 			pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-				 cpus->map[cpu],
+				 cpu,
 				 str_error_r(errno, sbuf, sizeof(sbuf)));
 			goto out_close_fd;
 		}
@@ -82,29 +82,29 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 			fd = openat(0, "/etc/passwd", O_RDONLY);
 			close(fd);
 		}
-		CPU_CLR(cpus->map[cpu], &cpu_set);
+		CPU_CLR(cpu, &cpu_set);
 	}
 
 	evsel->core.cpus = perf_cpu_map__get(cpus);
 
 	err = 0;
 
-	for (cpu = 0; cpu < cpus->nr; ++cpu) {
+	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		unsigned int expected;
 
-		if (cpus->map[cpu] >= CPU_SETSIZE)
+		if (cpu >= CPU_SETSIZE)
 			continue;
 
-		if (evsel__read_on_cpu(evsel, cpu, 0) < 0) {
+		if (evsel__read_on_cpu(evsel, idx, 0) < 0) {
 			pr_debug("evsel__read_on_cpu\n");
 			err = -1;
 			break;
 		}
 
-		expected = nr_openat_calls + cpu;
-		if (perf_counts(evsel->counts, cpu, 0)->val != expected) {
+		expected = nr_openat_calls + idx;
+		if (perf_counts(evsel->counts, idx, 0)->val != expected) {
 			pr_debug("evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
-				 expected, cpus->map[cpu], perf_counts(evsel->counts, cpu, 0)->val);
+				 expected, cpu, perf_counts(evsel->counts, idx, 0)->val);
 			err = -1;
 		}
 	}
-- 
cgit v1.2.3


From 379c224bef724cf52bd3d2364d29fc63f3e743d3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:44 -0800
Subject: perf stat: Correct check_per_pkg() cpu

Code was incorrectly using the cpu map index as the CPU.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-42-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index f7f9757eba23..86ab427e87fc 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -292,11 +292,12 @@ static bool pkg_id_equal(const void *__key1, const void *__key2,
 	return *key1 == *key2;
 }
 
-static int check_per_pkg(struct evsel *counter,
-			 struct perf_counts_values *vals, int cpu, bool *skip)
+static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals,
+			 int cpu_map_idx, bool *skip)
 {
 	struct hashmap *mask = counter->per_pkg_mask;
 	struct perf_cpu_map *cpus = evsel__cpus(counter);
+	int cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
 	int s, d, ret = 0;
 	uint64_t *key;
 
-- 
cgit v1.2.3


From 5b1af93dbc7e64ab1b872129cfe1f2318cc29c67 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:45 -0800
Subject: perf stat: Swap variable name cpu to index

The use of CPU is error prone, switch to cpu_map_idx.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-43-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-shadow.c | 308 +++++++++++++++++++++---------------------
 tools/perf/util/stat.c        |  16 +--
 tools/perf/util/stat.h        |   4 +-
 3 files changed, 164 insertions(+), 164 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 5c7308efa768..10af7804e482 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -32,7 +32,7 @@ struct saved_value {
 	struct evsel *evsel;
 	enum stat_type type;
 	int ctx;
-	int cpu;
+	int cpu_map_idx;
 	struct cgroup *cgrp;
 	struct runtime_stat *stat;
 	struct stats stats;
@@ -47,8 +47,8 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
 					     rb_node);
 	const struct saved_value *b = entry;
 
-	if (a->cpu != b->cpu)
-		return a->cpu - b->cpu;
+	if (a->cpu_map_idx != b->cpu_map_idx)
+		return a->cpu_map_idx - b->cpu_map_idx;
 
 	/*
 	 * Previously the rbtree was used to link generic metrics.
@@ -105,7 +105,7 @@ static void saved_value_delete(struct rblist *rblist __maybe_unused,
 }
 
 static struct saved_value *saved_value_lookup(struct evsel *evsel,
-					      int cpu,
+					      int cpu_map_idx,
 					      bool create,
 					      enum stat_type type,
 					      int ctx,
@@ -115,7 +115,7 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel,
 	struct rblist *rblist;
 	struct rb_node *nd;
 	struct saved_value dm = {
-		.cpu = cpu,
+		.cpu_map_idx = cpu_map_idx,
 		.evsel = evsel,
 		.type = type,
 		.ctx = ctx,
@@ -213,10 +213,10 @@ struct runtime_stat_data {
 
 static void update_runtime_stat(struct runtime_stat *st,
 				enum stat_type type,
-				int cpu, u64 count,
+				int cpu_map_idx, u64 count,
 				struct runtime_stat_data *rsd)
 {
-	struct saved_value *v = saved_value_lookup(NULL, cpu, true, type,
+	struct saved_value *v = saved_value_lookup(NULL, cpu_map_idx, true, type,
 						   rsd->ctx, st, rsd->cgrp);
 
 	if (v)
@@ -229,7 +229,7 @@ static void update_runtime_stat(struct runtime_stat *st,
  * instruction rates, etc:
  */
 void perf_stat__update_shadow_stats(struct evsel *counter, u64 count,
-				    int cpu, struct runtime_stat *st)
+				    int cpu_map_idx, struct runtime_stat *st)
 {
 	u64 count_ns = count;
 	struct saved_value *v;
@@ -241,88 +241,88 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count,
 	count *= counter->scale;
 
 	if (evsel__is_clock(counter))
-		update_runtime_stat(st, STAT_NSECS, cpu, count_ns, &rsd);
+		update_runtime_stat(st, STAT_NSECS, cpu_map_idx, count_ns, &rsd);
 	else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-		update_runtime_stat(st, STAT_CYCLES, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_CYCLES, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
-		update_runtime_stat(st, STAT_CYCLES_IN_TX, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_CYCLES_IN_TX, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TRANSACTION_START))
-		update_runtime_stat(st, STAT_TRANSACTION, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_TRANSACTION, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, ELISION_START))
-		update_runtime_stat(st, STAT_ELISION, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_ELISION, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
 		update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
 		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
 		update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
 		update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
 		update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING))
 		update_runtime_stat(st, STAT_TOPDOWN_RETIRING,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC))
 		update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND))
 		update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND))
 		update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_HEAVY_OPS))
 		update_runtime_stat(st, STAT_TOPDOWN_HEAVY_OPS,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_BR_MISPREDICT))
 		update_runtime_stat(st, STAT_TOPDOWN_BR_MISPREDICT,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_LAT))
 		update_runtime_stat(st, STAT_TOPDOWN_FETCH_LAT,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, TOPDOWN_MEM_BOUND))
 		update_runtime_stat(st, STAT_TOPDOWN_MEM_BOUND,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
 		update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
 		update_runtime_stat(st, STAT_STALLED_CYCLES_BACK,
-				    cpu, count, &rsd);
+				    cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-		update_runtime_stat(st, STAT_BRANCHES, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_BRANCHES, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-		update_runtime_stat(st, STAT_CACHEREFS, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_CACHEREFS, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-		update_runtime_stat(st, STAT_L1_DCACHE, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_L1_DCACHE, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-		update_runtime_stat(st, STAT_L1_ICACHE, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_L1_ICACHE, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-		update_runtime_stat(st, STAT_LL_CACHE, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_LL_CACHE, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-		update_runtime_stat(st, STAT_DTLB_CACHE, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_DTLB_CACHE, cpu_map_idx, count, &rsd);
 	else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-		update_runtime_stat(st, STAT_ITLB_CACHE, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_ITLB_CACHE, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, SMI_NUM))
-		update_runtime_stat(st, STAT_SMI_NUM, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_SMI_NUM, cpu_map_idx, count, &rsd);
 	else if (perf_stat_evsel__is(counter, APERF))
-		update_runtime_stat(st, STAT_APERF, cpu, count, &rsd);
+		update_runtime_stat(st, STAT_APERF, cpu_map_idx, count, &rsd);
 
 	if (counter->collect_stat) {
-		v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st,
+		v = saved_value_lookup(counter, cpu_map_idx, true, STAT_NONE, 0, st,
 				       rsd.cgrp);
 		update_stats(&v->stats, count);
 		if (counter->metric_leader)
 			v->metric_total += count;
 	} else if (counter->metric_leader) {
 		v = saved_value_lookup(counter->metric_leader,
-				       cpu, true, STAT_NONE, 0, st, rsd.cgrp);
+				       cpu_map_idx, true, STAT_NONE, 0, st, rsd.cgrp);
 		v->metric_total += count;
 		v->metric_other++;
 	}
@@ -464,12 +464,12 @@ void perf_stat__collect_metric_expr(struct evlist *evsel_list)
 }
 
 static double runtime_stat_avg(struct runtime_stat *st,
-			       enum stat_type type, int cpu,
+			       enum stat_type type, int cpu_map_idx,
 			       struct runtime_stat_data *rsd)
 {
 	struct saved_value *v;
 
-	v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp);
+	v = saved_value_lookup(NULL, cpu_map_idx, false, type, rsd->ctx, st, rsd->cgrp);
 	if (!v)
 		return 0.0;
 
@@ -477,12 +477,12 @@ static double runtime_stat_avg(struct runtime_stat *st,
 }
 
 static double runtime_stat_n(struct runtime_stat *st,
-			     enum stat_type type, int cpu,
+			     enum stat_type type, int cpu_map_idx,
 			     struct runtime_stat_data *rsd)
 {
 	struct saved_value *v;
 
-	v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp);
+	v = saved_value_lookup(NULL, cpu_map_idx, false, type, rsd->ctx, st, rsd->cgrp);
 	if (!v)
 		return 0.0;
 
@@ -490,7 +490,7 @@ static double runtime_stat_n(struct runtime_stat *st,
 }
 
 static void print_stalled_cycles_frontend(struct perf_stat_config *config,
-					  int cpu, double avg,
+					  int cpu_map_idx, double avg,
 					  struct perf_stat_output_ctx *out,
 					  struct runtime_stat *st,
 					  struct runtime_stat_data *rsd)
@@ -498,7 +498,7 @@ static void print_stalled_cycles_frontend(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -513,7 +513,7 @@ static void print_stalled_cycles_frontend(struct perf_stat_config *config,
 }
 
 static void print_stalled_cycles_backend(struct perf_stat_config *config,
-					 int cpu, double avg,
+					 int cpu_map_idx, double avg,
 					 struct perf_stat_output_ctx *out,
 					 struct runtime_stat *st,
 					 struct runtime_stat_data *rsd)
@@ -521,7 +521,7 @@ static void print_stalled_cycles_backend(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -532,7 +532,7 @@ static void print_stalled_cycles_backend(struct perf_stat_config *config,
 }
 
 static void print_branch_misses(struct perf_stat_config *config,
-				int cpu, double avg,
+				int cpu_map_idx, double avg,
 				struct perf_stat_output_ctx *out,
 				struct runtime_stat *st,
 				struct runtime_stat_data *rsd)
@@ -540,7 +540,7 @@ static void print_branch_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_BRANCHES, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_BRANCHES, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -551,7 +551,7 @@ static void print_branch_misses(struct perf_stat_config *config,
 }
 
 static void print_l1_dcache_misses(struct perf_stat_config *config,
-				   int cpu, double avg,
+				   int cpu_map_idx, double avg,
 				   struct perf_stat_output_ctx *out,
 				   struct runtime_stat *st,
 				   struct runtime_stat_data *rsd)
@@ -559,7 +559,7 @@ static void print_l1_dcache_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_L1_DCACHE, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_L1_DCACHE, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -570,7 +570,7 @@ static void print_l1_dcache_misses(struct perf_stat_config *config,
 }
 
 static void print_l1_icache_misses(struct perf_stat_config *config,
-				   int cpu, double avg,
+				   int cpu_map_idx, double avg,
 				   struct perf_stat_output_ctx *out,
 				   struct runtime_stat *st,
 				   struct runtime_stat_data *rsd)
@@ -578,7 +578,7 @@ static void print_l1_icache_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_L1_ICACHE, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_L1_ICACHE, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -588,7 +588,7 @@ static void print_l1_icache_misses(struct perf_stat_config *config,
 }
 
 static void print_dtlb_cache_misses(struct perf_stat_config *config,
-				    int cpu, double avg,
+				    int cpu_map_idx, double avg,
 				    struct perf_stat_output_ctx *out,
 				    struct runtime_stat *st,
 				    struct runtime_stat_data *rsd)
@@ -596,7 +596,7 @@ static void print_dtlb_cache_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_DTLB_CACHE, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_DTLB_CACHE, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -606,7 +606,7 @@ static void print_dtlb_cache_misses(struct perf_stat_config *config,
 }
 
 static void print_itlb_cache_misses(struct perf_stat_config *config,
-				    int cpu, double avg,
+				    int cpu_map_idx, double avg,
 				    struct perf_stat_output_ctx *out,
 				    struct runtime_stat *st,
 				    struct runtime_stat_data *rsd)
@@ -614,7 +614,7 @@ static void print_itlb_cache_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_ITLB_CACHE, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_ITLB_CACHE, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -624,7 +624,7 @@ static void print_itlb_cache_misses(struct perf_stat_config *config,
 }
 
 static void print_ll_cache_misses(struct perf_stat_config *config,
-				  int cpu, double avg,
+				  int cpu_map_idx, double avg,
 				  struct perf_stat_output_ctx *out,
 				  struct runtime_stat *st,
 				  struct runtime_stat_data *rsd)
@@ -632,7 +632,7 @@ static void print_ll_cache_misses(struct perf_stat_config *config,
 	double total, ratio = 0.0;
 	const char *color;
 
-	total = runtime_stat_avg(st, STAT_LL_CACHE, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_LL_CACHE, cpu_map_idx, rsd);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -690,61 +690,61 @@ static double sanitize_val(double x)
 	return x;
 }
 
-static double td_total_slots(int cpu, struct runtime_stat *st,
+static double td_total_slots(int cpu_map_idx, struct runtime_stat *st,
 			     struct runtime_stat_data *rsd)
 {
-	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, cpu, rsd);
+	return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, cpu_map_idx, rsd);
 }
 
-static double td_bad_spec(int cpu, struct runtime_stat *st,
+static double td_bad_spec(int cpu_map_idx, struct runtime_stat *st,
 			  struct runtime_stat_data *rsd)
 {
 	double bad_spec = 0;
 	double total_slots;
 	double total;
 
-	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, cpu, rsd) -
-		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, cpu, rsd) +
-		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, cpu, rsd);
+	total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, cpu_map_idx, rsd) -
+		runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, cpu_map_idx, rsd) +
+		runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, cpu_map_idx, rsd);
 
-	total_slots = td_total_slots(cpu, st, rsd);
+	total_slots = td_total_slots(cpu_map_idx, st, rsd);
 	if (total_slots)
 		bad_spec = total / total_slots;
 	return sanitize_val(bad_spec);
 }
 
-static double td_retiring(int cpu, struct runtime_stat *st,
+static double td_retiring(int cpu_map_idx, struct runtime_stat *st,
 			  struct runtime_stat_data *rsd)
 {
 	double retiring = 0;
-	double total_slots = td_total_slots(cpu, st, rsd);
+	double total_slots = td_total_slots(cpu_map_idx, st, rsd);
 	double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED,
-					    cpu, rsd);
+					    cpu_map_idx, rsd);
 
 	if (total_slots)
 		retiring = ret_slots / total_slots;
 	return retiring;
 }
 
-static double td_fe_bound(int cpu, struct runtime_stat *st,
+static double td_fe_bound(int cpu_map_idx, struct runtime_stat *st,
 			  struct runtime_stat_data *rsd)
 {
 	double fe_bound = 0;
-	double total_slots = td_total_slots(cpu, st, rsd);
+	double total_slots = td_total_slots(cpu_map_idx, st, rsd);
 	double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES,
-					    cpu, rsd);
+					    cpu_map_idx, rsd);
 
 	if (total_slots)
 		fe_bound = fetch_bub / total_slots;
 	return fe_bound;
 }
 
-static double td_be_bound(int cpu, struct runtime_stat *st,
+static double td_be_bound(int cpu_map_idx, struct runtime_stat *st,
 			  struct runtime_stat_data *rsd)
 {
-	double sum = (td_fe_bound(cpu, st, rsd) +
-		      td_bad_spec(cpu, st, rsd) +
-		      td_retiring(cpu, st, rsd));
+	double sum = (td_fe_bound(cpu_map_idx, st, rsd) +
+		      td_bad_spec(cpu_map_idx, st, rsd) +
+		      td_retiring(cpu_map_idx, st, rsd));
 	if (sum == 0)
 		return 0;
 	return sanitize_val(1.0 - sum);
@@ -755,15 +755,15 @@ static double td_be_bound(int cpu, struct runtime_stat *st,
  * the ratios we need to recreate the sum.
  */
 
-static double td_metric_ratio(int cpu, enum stat_type type,
+static double td_metric_ratio(int cpu_map_idx, enum stat_type type,
 			      struct runtime_stat *stat,
 			      struct runtime_stat_data *rsd)
 {
-	double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) +
-		runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) +
-		runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) +
-		runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd);
-	double d = runtime_stat_avg(stat, type, cpu, rsd);
+	double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu_map_idx, rsd) +
+		runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu_map_idx, rsd) +
+		runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu_map_idx, rsd) +
+		runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu_map_idx, rsd);
+	double d = runtime_stat_avg(stat, type, cpu_map_idx, rsd);
 
 	if (sum)
 		return d / sum;
@@ -775,23 +775,23 @@ static double td_metric_ratio(int cpu, enum stat_type type,
  * We allow two missing.
  */
 
-static bool full_td(int cpu, struct runtime_stat *stat,
+static bool full_td(int cpu_map_idx, struct runtime_stat *stat,
 		    struct runtime_stat_data *rsd)
 {
 	int c = 0;
 
-	if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) > 0)
+	if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu_map_idx, rsd) > 0)
 		c++;
-	if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) > 0)
+	if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu_map_idx, rsd) > 0)
 		c++;
-	if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) > 0)
+	if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu_map_idx, rsd) > 0)
 		c++;
-	if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd) > 0)
+	if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu_map_idx, rsd) > 0)
 		c++;
 	return c >= 2;
 }
 
-static void print_smi_cost(struct perf_stat_config *config, int cpu,
+static void print_smi_cost(struct perf_stat_config *config, int cpu_map_idx,
 			   struct perf_stat_output_ctx *out,
 			   struct runtime_stat *st,
 			   struct runtime_stat_data *rsd)
@@ -799,9 +799,9 @@ static void print_smi_cost(struct perf_stat_config *config, int cpu,
 	double smi_num, aperf, cycles, cost = 0.0;
 	const char *color = NULL;
 
-	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, cpu, rsd);
-	aperf = runtime_stat_avg(st, STAT_APERF, cpu, rsd);
-	cycles = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd);
+	smi_num = runtime_stat_avg(st, STAT_SMI_NUM, cpu_map_idx, rsd);
+	aperf = runtime_stat_avg(st, STAT_APERF, cpu_map_idx, rsd);
+	cycles = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, rsd);
 
 	if ((cycles == 0) || (aperf == 0))
 		return;
@@ -818,7 +818,7 @@ static void print_smi_cost(struct perf_stat_config *config, int cpu,
 static int prepare_metric(struct evsel **metric_events,
 			  struct metric_ref *metric_refs,
 			  struct expr_parse_ctx *pctx,
-			  int cpu,
+			  int cpu_map_idx,
 			  struct runtime_stat *st)
 {
 	double scale;
@@ -836,7 +836,7 @@ static int prepare_metric(struct evsel **metric_events,
 			scale = 1e-9;
 			source_count = 1;
 		} else {
-			v = saved_value_lookup(metric_events[i], cpu, false,
+			v = saved_value_lookup(metric_events[i], cpu_map_idx, false,
 					       STAT_NONE, 0, st,
 					       metric_events[i]->cgrp);
 			if (!v)
@@ -874,7 +874,7 @@ static void generic_metric(struct perf_stat_config *config,
 			   const char *metric_name,
 			   const char *metric_unit,
 			   int runtime,
-			   int cpu,
+			   int cpu_map_idx,
 			   struct perf_stat_output_ctx *out,
 			   struct runtime_stat *st)
 {
@@ -889,7 +889,7 @@ static void generic_metric(struct perf_stat_config *config,
 		return;
 
 	pctx->runtime = runtime;
-	i = prepare_metric(metric_events, metric_refs, pctx, cpu, st);
+	i = prepare_metric(metric_events, metric_refs, pctx, cpu_map_idx, st);
 	if (i < 0) {
 		expr__ctx_free(pctx);
 		return;
@@ -934,7 +934,7 @@ static void generic_metric(struct perf_stat_config *config,
 	expr__ctx_free(pctx);
 }
 
-double test_generic_metric(struct metric_expr *mexp, int cpu, struct runtime_stat *st)
+double test_generic_metric(struct metric_expr *mexp, int cpu_map_idx, struct runtime_stat *st)
 {
 	struct expr_parse_ctx *pctx;
 	double ratio = 0.0;
@@ -943,7 +943,7 @@ double test_generic_metric(struct metric_expr *mexp, int cpu, struct runtime_sta
 	if (!pctx)
 		return NAN;
 
-	if (prepare_metric(mexp->metric_events, mexp->metric_refs, pctx, cpu, st) < 0)
+	if (prepare_metric(mexp->metric_events, mexp->metric_refs, pctx, cpu_map_idx, st) < 0)
 		goto out;
 
 	if (expr__parse(&ratio, pctx, mexp->metric_expr))
@@ -956,7 +956,7 @@ out:
 
 void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				   struct evsel *evsel,
-				   double avg, int cpu,
+				   double avg, int cpu_map_idx,
 				   struct perf_stat_output_ctx *out,
 				   struct rblist *metric_events,
 				   struct runtime_stat *st)
@@ -975,7 +975,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 	if (config->iostat_run) {
 		iostat_print_metric(config, evsel, out);
 	} else if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
-		total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd);
 
 		if (total) {
 			ratio = avg / total;
@@ -985,11 +985,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 			print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0);
 		}
 
-		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, cpu_map_idx, &rsd);
 
 		total = max(total, runtime_stat_avg(st,
 						    STAT_STALLED_CYCLES_BACK,
-						    cpu, &rsd));
+						    cpu_map_idx, &rsd));
 
 		if (total && avg) {
 			out->new_line(config, ctxp);
@@ -999,8 +999,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					ratio);
 		}
 	} else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
-		if (runtime_stat_n(st, STAT_BRANCHES, cpu, &rsd) != 0)
-			print_branch_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_BRANCHES, cpu_map_idx, &rsd) != 0)
+			print_branch_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all branches", 0);
 	} else if (
@@ -1009,8 +1009,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
-		if (runtime_stat_n(st, STAT_L1_DCACHE, cpu, &rsd) != 0)
-			print_l1_dcache_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_L1_DCACHE, cpu_map_idx, &rsd) != 0)
+			print_l1_dcache_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all L1-dcache accesses", 0);
 	} else if (
@@ -1019,8 +1019,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
-		if (runtime_stat_n(st, STAT_L1_ICACHE, cpu, &rsd) != 0)
-			print_l1_icache_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_L1_ICACHE, cpu_map_idx, &rsd) != 0)
+			print_l1_icache_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all L1-icache accesses", 0);
 	} else if (
@@ -1029,8 +1029,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
-		if (runtime_stat_n(st, STAT_DTLB_CACHE, cpu, &rsd) != 0)
-			print_dtlb_cache_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_DTLB_CACHE, cpu_map_idx, &rsd) != 0)
+			print_dtlb_cache_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all dTLB cache accesses", 0);
 	} else if (
@@ -1039,8 +1039,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
-		if (runtime_stat_n(st, STAT_ITLB_CACHE, cpu, &rsd) != 0)
-			print_itlb_cache_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_ITLB_CACHE, cpu_map_idx, &rsd) != 0)
+			print_itlb_cache_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all iTLB cache accesses", 0);
 	} else if (
@@ -1049,27 +1049,27 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
 
-		if (runtime_stat_n(st, STAT_LL_CACHE, cpu, &rsd) != 0)
-			print_ll_cache_misses(config, cpu, avg, out, st, &rsd);
+		if (runtime_stat_n(st, STAT_LL_CACHE, cpu_map_idx, &rsd) != 0)
+			print_ll_cache_misses(config, cpu_map_idx, avg, out, st, &rsd);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all LL-cache accesses", 0);
 	} else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
-		total = runtime_stat_avg(st, STAT_CACHEREFS, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CACHEREFS, cpu_map_idx, &rsd);
 
 		if (total)
 			ratio = avg * 100 / total;
 
-		if (runtime_stat_n(st, STAT_CACHEREFS, cpu, &rsd) != 0)
+		if (runtime_stat_n(st, STAT_CACHEREFS, cpu_map_idx, &rsd) != 0)
 			print_metric(config, ctxp, NULL, "%8.3f %%",
 				     "of all cache refs", ratio);
 		else
 			print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0);
 	} else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-		print_stalled_cycles_frontend(config, cpu, avg, out, st, &rsd);
+		print_stalled_cycles_frontend(config, cpu_map_idx, avg, out, st, &rsd);
 	} else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-		print_stalled_cycles_backend(config, cpu, avg, out, st, &rsd);
+		print_stalled_cycles_backend(config, cpu_map_idx, avg, out, st, &rsd);
 	} else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
-		total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_NSECS, cpu_map_idx, &rsd);
 
 		if (total) {
 			ratio = avg / total;
@@ -1078,7 +1078,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 			print_metric(config, ctxp, NULL, NULL, "Ghz", 0);
 		}
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
-		total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd);
 
 		if (total)
 			print_metric(config, ctxp, NULL,
@@ -1088,8 +1088,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 			print_metric(config, ctxp, NULL, NULL, "transactional cycles",
 				     0);
 	} else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
-		total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd);
-		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CYCLES, cpu_map_idx, &rsd);
+		total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd);
 
 		if (total2 < avg)
 			total2 = avg;
@@ -1099,19 +1099,19 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		else
 			print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0);
 	} else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
-		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd);
 
 		if (avg)
 			ratio = total / avg;
 
-		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, cpu, &rsd) != 0)
+		if (runtime_stat_n(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd) != 0)
 			print_metric(config, ctxp, NULL, "%8.0f",
 				     "cycles / transaction", ratio);
 		else
 			print_metric(config, ctxp, NULL, NULL, "cycles / transaction",
 				      0);
 	} else if (perf_stat_evsel__is(evsel, ELISION_START)) {
-		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu_map_idx, &rsd);
 
 		if (avg)
 			ratio = total / avg;
@@ -1124,28 +1124,28 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		else
 			print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
-		double fe_bound = td_fe_bound(cpu, st, &rsd);
+		double fe_bound = td_fe_bound(cpu_map_idx, st, &rsd);
 
 		if (fe_bound > 0.2)
 			color = PERF_COLOR_RED;
 		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
 				fe_bound * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
-		double retiring = td_retiring(cpu, st, &rsd);
+		double retiring = td_retiring(cpu_map_idx, st, &rsd);
 
 		if (retiring > 0.7)
 			color = PERF_COLOR_GREEN;
 		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
 				retiring * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
-		double bad_spec = td_bad_spec(cpu, st, &rsd);
+		double bad_spec = td_bad_spec(cpu_map_idx, st, &rsd);
 
 		if (bad_spec > 0.1)
 			color = PERF_COLOR_RED;
 		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
 				bad_spec * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
-		double be_bound = td_be_bound(cpu, st, &rsd);
+		double be_bound = td_be_bound(cpu_map_idx, st, &rsd);
 		const char *name = "backend bound";
 		static int have_recovery_bubbles = -1;
 
@@ -1158,14 +1158,14 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 
 		if (be_bound > 0.2)
 			color = PERF_COLOR_RED;
-		if (td_total_slots(cpu, st, &rsd) > 0)
+		if (td_total_slots(cpu_map_idx, st, &rsd) > 0)
 			print_metric(config, ctxp, color, "%8.1f%%", name,
 					be_bound * 100.);
 		else
 			print_metric(config, ctxp, NULL, NULL, name, 0);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) &&
-		   full_td(cpu, st, &rsd)) {
-		double retiring = td_metric_ratio(cpu,
+		   full_td(cpu_map_idx, st, &rsd)) {
+		double retiring = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_RETIRING, st,
 						  &rsd);
 		if (retiring > 0.7)
@@ -1173,8 +1173,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "retiring",
 				retiring * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) &&
-		   full_td(cpu, st, &rsd)) {
-		double fe_bound = td_metric_ratio(cpu,
+		   full_td(cpu_map_idx, st, &rsd)) {
+		double fe_bound = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_FE_BOUND, st,
 						  &rsd);
 		if (fe_bound > 0.2)
@@ -1182,8 +1182,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
 				fe_bound * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) &&
-		   full_td(cpu, st, &rsd)) {
-		double be_bound = td_metric_ratio(cpu,
+		   full_td(cpu_map_idx, st, &rsd)) {
+		double be_bound = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_BE_BOUND, st,
 						  &rsd);
 		if (be_bound > 0.2)
@@ -1191,8 +1191,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "backend bound",
 				be_bound * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) &&
-		   full_td(cpu, st, &rsd)) {
-		double bad_spec = td_metric_ratio(cpu,
+		   full_td(cpu_map_idx, st, &rsd)) {
+		double bad_spec = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_BAD_SPEC, st,
 						  &rsd);
 		if (bad_spec > 0.1)
@@ -1200,11 +1200,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
 				bad_spec * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_HEAVY_OPS) &&
-			full_td(cpu, st, &rsd) && (config->topdown_level > 1)) {
-		double retiring = td_metric_ratio(cpu,
+			full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) {
+		double retiring = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_RETIRING, st,
 						  &rsd);
-		double heavy_ops = td_metric_ratio(cpu,
+		double heavy_ops = td_metric_ratio(cpu_map_idx,
 						   STAT_TOPDOWN_HEAVY_OPS, st,
 						   &rsd);
 		double light_ops = retiring - heavy_ops;
@@ -1220,11 +1220,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "light operations",
 				light_ops * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_BR_MISPREDICT) &&
-			full_td(cpu, st, &rsd) && (config->topdown_level > 1)) {
-		double bad_spec = td_metric_ratio(cpu,
+			full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) {
+		double bad_spec = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_BAD_SPEC, st,
 						  &rsd);
-		double br_mis = td_metric_ratio(cpu,
+		double br_mis = td_metric_ratio(cpu_map_idx,
 						STAT_TOPDOWN_BR_MISPREDICT, st,
 						&rsd);
 		double m_clears = bad_spec - br_mis;
@@ -1240,11 +1240,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "machine clears",
 				m_clears * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_LAT) &&
-			full_td(cpu, st, &rsd) && (config->topdown_level > 1)) {
-		double fe_bound = td_metric_ratio(cpu,
+			full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) {
+		double fe_bound = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_FE_BOUND, st,
 						  &rsd);
-		double fetch_lat = td_metric_ratio(cpu,
+		double fetch_lat = td_metric_ratio(cpu_map_idx,
 						   STAT_TOPDOWN_FETCH_LAT, st,
 						   &rsd);
 		double fetch_bw = fe_bound - fetch_lat;
@@ -1260,11 +1260,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		print_metric(config, ctxp, color, "%8.1f%%", "fetch bandwidth",
 				fetch_bw * 100.);
 	} else if (perf_stat_evsel__is(evsel, TOPDOWN_MEM_BOUND) &&
-			full_td(cpu, st, &rsd) && (config->topdown_level > 1)) {
-		double be_bound = td_metric_ratio(cpu,
+			full_td(cpu_map_idx, st, &rsd) && (config->topdown_level > 1)) {
+		double be_bound = td_metric_ratio(cpu_map_idx,
 						  STAT_TOPDOWN_BE_BOUND, st,
 						  &rsd);
-		double mem_bound = td_metric_ratio(cpu,
+		double mem_bound = td_metric_ratio(cpu_map_idx,
 						   STAT_TOPDOWN_MEM_BOUND, st,
 						   &rsd);
 		double core_bound = be_bound - mem_bound;
@@ -1281,12 +1281,12 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				core_bound * 100.);
 	} else if (evsel->metric_expr) {
 		generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
-				evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);
-	} else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) {
+				evsel->name, evsel->metric_name, NULL, 1, cpu_map_idx, out, st);
+	} else if (runtime_stat_n(st, STAT_NSECS, cpu_map_idx, &rsd) != 0) {
 		char unit = ' ';
 		char unit_buf[10] = "/sec";
 
-		total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd);
+		total = runtime_stat_avg(st, STAT_NSECS, cpu_map_idx, &rsd);
 		if (total)
 			ratio = convert_unit_double(1000000000.0 * avg / total, &unit);
 
@@ -1294,7 +1294,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 			snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
 	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
-		print_smi_cost(config, cpu, out, st, &rsd);
+		print_smi_cost(config, cpu_map_idx, out, st, &rsd);
 	} else {
 		num = 0;
 	}
@@ -1307,7 +1307,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				out->new_line(config, ctxp);
 			generic_metric(config, mexp->metric_expr, mexp->metric_events,
 					mexp->metric_refs, evsel->name, mexp->metric_name,
-					mexp->metric_unit, mexp->runtime, cpu, out, st);
+					mexp->metric_unit, mexp->runtime, cpu_map_idx, out, st);
 		}
 	}
 	if (num == 0)
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 86ab427e87fc..7dbd7c4f3c33 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -356,14 +356,14 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals,
 
 static int
 process_counter_values(struct perf_stat_config *config, struct evsel *evsel,
-		       int cpu, int thread,
+		       int cpu_map_idx, int thread,
 		       struct perf_counts_values *count)
 {
 	struct perf_counts_values *aggr = &evsel->counts->aggr;
 	static struct perf_counts_values zero;
 	bool skip = false;
 
-	if (check_per_pkg(evsel, count, cpu, &skip)) {
+	if (check_per_pkg(evsel, count, cpu_map_idx, &skip)) {
 		pr_err("failed to read per-pkg counter\n");
 		return -1;
 	}
@@ -379,11 +379,11 @@ process_counter_values(struct perf_stat_config *config, struct evsel *evsel,
 	case AGGR_NODE:
 	case AGGR_NONE:
 		if (!evsel->snapshot)
-			evsel__compute_deltas(evsel, cpu, thread, count);
+			evsel__compute_deltas(evsel, cpu_map_idx, thread, count);
 		perf_counts_values__scale(count, config->scale, NULL);
 		if ((config->aggr_mode == AGGR_NONE) && (!evsel->percore)) {
 			perf_stat__update_shadow_stats(evsel, count->val,
-						       cpu, &rt_stat);
+						       cpu_map_idx, &rt_stat);
 		}
 
 		if (config->aggr_mode == AGGR_THREAD) {
@@ -412,15 +412,15 @@ static int process_counter_maps(struct perf_stat_config *config,
 {
 	int nthreads = perf_thread_map__nr(counter->core.threads);
 	int ncpus = evsel__nr_cpus(counter);
-	int cpu, thread;
+	int idx, thread;
 
 	if (counter->core.system_wide)
 		nthreads = 1;
 
 	for (thread = 0; thread < nthreads; thread++) {
-		for (cpu = 0; cpu < ncpus; cpu++) {
-			if (process_counter_values(config, counter, cpu, thread,
-						   perf_counts(counter->counts, cpu, thread)))
+		for (idx = 0; idx < ncpus; idx++) {
+			if (process_counter_values(config, counter, idx, thread,
+						   perf_counts(counter->counts, idx, thread)))
 				return -1;
 		}
 	}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 5e25d53e891b..691c12fd8976 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -208,7 +208,7 @@ void perf_stat__init_shadow_stats(void);
 void perf_stat__reset_shadow_stats(void);
 void perf_stat__reset_shadow_per_stat(struct runtime_stat *st);
 void perf_stat__update_shadow_stats(struct evsel *counter, u64 count,
-				    int cpu, struct runtime_stat *st);
+				    int cpu_map_idx, struct runtime_stat *st);
 struct perf_stat_output_ctx {
 	void *ctx;
 	print_metric_t print_metric;
@@ -253,5 +253,5 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 			    struct target *_target, struct timespec *ts, int argc, const char **argv);
 
 struct metric_expr;
-double test_generic_metric(struct metric_expr *mexp, int cpu, struct runtime_stat *st);
+double test_generic_metric(struct metric_expr *mexp, int cpu_map_idx, struct runtime_stat *st);
 #endif
-- 
cgit v1.2.3


From 91802e73f77146d69afa0be7eafc983ec84b2bb0 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:46 -0800
Subject: libperf: Sync evsel documentation

cpu was renamed cpu_map_idx, for clarity.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-44-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Documentation/libperf.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index faef9ba3a540..32c5051c24eb 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -136,16 +136,16 @@ SYNOPSIS
   int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
                        struct perf_thread_map *threads);
   void perf_evsel__close(struct perf_evsel *evsel);
-  void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+  void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu_map_idx);
   int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
   void perf_evsel__munmap(struct perf_evsel *evsel);
-  void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
-  int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
+  void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu_map_idx, int thread);
+  int perf_evsel__read(struct perf_evsel *evsel, int cpu_map_idx, int thread,
                        struct perf_counts_values *count);
   int perf_evsel__enable(struct perf_evsel *evsel);
-  int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu);
+  int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
   int perf_evsel__disable(struct perf_evsel *evsel);
-  int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu);
+  int perf_evsel__disable_cpu(struct perf_evsel *evsel, int cpu_map_idx);
   struct perf_cpu_map *perf_evsel__cpus(struct perf_evsel *evsel);
   struct perf_thread_map *perf_evsel__threads(struct perf_evsel *evsel);
   struct perf_event_attr *perf_evsel__attr(struct perf_evsel *evsel);
-- 
cgit v1.2.3


From 7263f3498ba8b6e65c1d810ccafec64cd61a6dc1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:47 -0800
Subject: perf bpf: Rename 'cpu' to 'cpu_map_idx'

Synchronize the caller in evsel with the called function.

Shorten 3 lines of code in bperf_read by using
perf_cpu_map__for_each_cpu().

This code is frequently using variables named cpu as cpu map indices,
which doesn't matter as all CPUs are in the CPU map. It is strange in
some cases the cpumap is used at all.

Committer notes:

Found when building with BUILD_BPF_SKEL=1:

Remove unused 'num_cpu' variable in bperf__read().

Make 'j' an 'int' as it is used in perf_cpu_map__for_each_cpu() to compare against an 'int'

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-45-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_counter.c | 19 +++++++++----------
 tools/perf/util/bpf_counter.h |  4 ++--
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index c17d4a43ce06..80d1a3a31052 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -265,7 +265,7 @@ static int bpf_program_profiler__read(struct evsel *evsel)
 	return 0;
 }
 
-static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
+static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx,
 					    int fd)
 {
 	struct bpf_prog_profiler_bpf *skel;
@@ -277,7 +277,7 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
 		assert(skel != NULL);
 
 		ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events),
-					  &cpu, &fd, BPF_ANY);
+					  &cpu_map_idx, &fd, BPF_ANY);
 		if (ret)
 			return ret;
 	}
@@ -566,12 +566,12 @@ out:
 	return err;
 }
 
-static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
+static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
 {
 	struct bperf_leader_bpf *skel = evsel->leader_skel;
 
 	return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
-				   &cpu, &fd, BPF_ANY);
+				   &cpu_map_idx, &fd, BPF_ANY);
 }
 
 /*
@@ -608,7 +608,8 @@ static int bperf__read(struct evsel *evsel)
 	__u32 num_cpu_bpf = cpu__max_cpu();
 	struct bpf_perf_event_value values[num_cpu_bpf];
 	int reading_map_fd, err = 0;
-	__u32 i, j, num_cpu;
+	__u32 i;
+	int j;
 
 	bperf_sync_counters(evsel);
 	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
@@ -623,9 +624,7 @@ static int bperf__read(struct evsel *evsel)
 		case BPERF_FILTER_GLOBAL:
 			assert(i == 0);
 
-			num_cpu = all_cpu_map->nr;
-			for (j = 0; j < num_cpu; j++) {
-				cpu = all_cpu_map->map[j];
+			perf_cpu_map__for_each_cpu(cpu, j, all_cpu_map) {
 				perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
 				perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
 				perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
@@ -757,11 +756,11 @@ static inline bool bpf_counter_skip(struct evsel *evsel)
 		evsel->follower_skel == NULL;
 }
 
-int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
+int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
 {
 	if (bpf_counter_skip(evsel))
 		return 0;
-	return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
+	return evsel->bpf_counter_ops->install_pe(evsel, cpu_map_idx, fd);
 }
 
 int bpf_counter__load(struct evsel *evsel, struct target *target)
diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h
index 65ebaa6694fb..4dbf26408b69 100644
--- a/tools/perf/util/bpf_counter.h
+++ b/tools/perf/util/bpf_counter.h
@@ -16,7 +16,7 @@ typedef int (*bpf_counter_evsel_op)(struct evsel *evsel);
 typedef int (*bpf_counter_evsel_target_op)(struct evsel *evsel,
 					   struct target *target);
 typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel,
-					       int cpu,
+					       int cpu_map_idx,
 					       int fd);
 
 struct bpf_counter_ops {
@@ -40,7 +40,7 @@ int bpf_counter__enable(struct evsel *evsel);
 int bpf_counter__disable(struct evsel *evsel);
 int bpf_counter__read(struct evsel *evsel);
 void bpf_counter__destroy(struct evsel *evsel);
-int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
+int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd);
 
 #else /* HAVE_BPF_SKEL */
 
-- 
cgit v1.2.3


From 84d2f4f0375d4857f9f9e57a9ad75cbf0f34e108 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:48 -0800
Subject: perf c2c: Use more intention revealing iterator

Use perf_cpu_map__for_each_cpu() in setup_nodes.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-46-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-c2c.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index b5c67ef73862..ad1fbeafc93d 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2015,7 +2015,7 @@ static int setup_nodes(struct perf_session *session)
 {
 	struct numa_node *n;
 	unsigned long **nodes;
-	int node, cpu;
+	int node, cpu, idx;
 	int *cpu2node;
 
 	if (c2c.node_info > 2)
@@ -2057,13 +2057,13 @@ static int setup_nodes(struct perf_session *session)
 		if (perf_cpu_map__empty(map))
 			continue;
 
-		for (cpu = 0; cpu < map->nr; cpu++) {
-			set_bit(map->map[cpu], set);
+		perf_cpu_map__for_each_cpu(cpu, idx, map) {
+			set_bit(cpu, set);
 
-			if (WARN_ONCE(cpu2node[map->map[cpu]] != -1, "node/cpu topology bug"))
+			if (WARN_ONCE(cpu2node[cpu] != -1, "node/cpu topology bug"))
 				return -EINVAL;
 
-			cpu2node[map->map[cpu]] = node;
+			cpu2node[cpu] = node;
 		}
 	}
 
-- 
cgit v1.2.3


From b57af1b4017abff969425dffd6f59ddfdedce8cb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:49 -0800
Subject: perf script: Fix flipped index and cpu

perf_counts are accessed by the densely packed index.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-47-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index f40319144856..bb43529618b3 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2131,7 +2131,7 @@ static void __process_stat(struct evsel *counter, u64 tstamp)
 		perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
 			struct perf_counts_values *counts;
 
-			counts = perf_counts(counter->counts, cpu, thread);
+			counts = perf_counts(counter->counts, idx, thread);
 
 			printf("%3d %8d %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %s\n",
 				cpu,
-- 
cgit v1.2.3


From ce37ab3eb2490aba60ab1a622a4c6c6ee9a7cc66 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:50 -0800
Subject: perf stat: Correct first_shadow_cpu to return index

perf_stat__update_shadow_stats() and perf_stat__print_shadow_stats() use
a cpu map index rather than a CPU, but first_shadow_cpu is returning the
wrong value for this. Change first_shadow_cpu to
first_shadow_cpu_map_idx to make things agree.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-48-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 0f192360b6c6..ba95379efcfb 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -327,24 +327,23 @@ static void print_metric_header(struct perf_stat_config *config,
 		fprintf(os->fh, "%*s ", config->metric_only_len, unit);
 }
 
-static int first_shadow_cpu(struct perf_stat_config *config,
-			    struct evsel *evsel, const struct aggr_cpu_id *id)
+static int first_shadow_cpu_map_idx(struct perf_stat_config *config,
+				struct evsel *evsel, const struct aggr_cpu_id *id)
 {
-	struct perf_cpu_map *cpus;
+	struct perf_cpu_map *cpus = evsel__cpus(evsel);
 	int cpu, idx;
 
 	if (config->aggr_mode == AGGR_NONE)
-		return id->cpu;
+		return perf_cpu_map__idx(cpus, id->cpu);
 
 	if (!config->aggr_get_id)
 		return 0;
 
-	cpus = evsel__cpus(evsel);
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		struct aggr_cpu_id cpu_id = config->aggr_get_id(config, cpu);
 
 		if (aggr_cpu_id__equal(&cpu_id, id))
-			return cpu;
+			return idx;
 	}
 	return 0;
 }
@@ -503,7 +502,7 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 	}
 
 	perf_stat__print_shadow_stats(config, counter, uval,
-				first_shadow_cpu(config, counter, &id),
+				first_shadow_cpu_map_idx(config, counter, &id),
 				&out, &config->metric_events, st);
 	if (!config->csv_output && !config->metric_only) {
 		print_noise(config, counter, noise);
@@ -532,7 +531,7 @@ static void aggr_update_shadow(struct perf_stat_config *config,
 				val += perf_counts(counter->counts, idx, 0)->val;
 			}
 			perf_stat__update_shadow_stats(counter, val,
-					first_shadow_cpu(config, counter, &id),
+					first_shadow_cpu_map_idx(config, counter, &id),
 					&rt_stat);
 		}
 	}
-- 
cgit v1.2.3


From 6d18804b963b78dcd53851f11e9080408b3d85c2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 4 Jan 2022 22:13:51 -0800
Subject: perf cpumap: Give CPUs their own type

A common problem is confusing CPU map indices with the CPU, by wrapping
the CPU with a struct then this is avoided. This approach is similar to
atomic_t.

Committer notes:

To make it build with BUILD_BPF_SKEL=1 these files needed the
conversions to 'struct perf_cpu' usage:

  tools/perf/util/bpf_counter.c
  tools/perf/util/bpf_counter_cgroup.c
  tools/perf/util/bpf_ftrace.c

Also perf_env__get_cpu() was removed back in "perf cpumap: Switch
cpu_map__build_map to cpu function".

Additionally these needed to be fixed for the ARM builds to complete:

  tools/perf/arch/arm/util/cs-etm.c
  tools/perf/arch/arm64/util/pmu.c

Suggested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Vineet Singh <vineet.singh@intel.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: zhengjun.xing@intel.com
Link: https://lore.kernel.org/r/20220105061351.120843-49-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/cpumap.c                            | 103 ++++++++++++---------
 tools/lib/perf/evlist.c                            |   4 +-
 tools/lib/perf/evsel.c                             |   9 +-
 tools/lib/perf/include/internal/cpumap.h           |   9 +-
 tools/lib/perf/include/internal/evlist.h           |   3 +-
 tools/lib/perf/include/internal/evsel.h            |   4 +-
 tools/lib/perf/include/internal/mmap.h             |   5 +-
 tools/lib/perf/include/perf/cpumap.h               |   9 +-
 tools/lib/perf/mmap.c                              |   2 +-
 tools/perf/arch/arm/util/cs-etm.c                  |  35 ++++---
 tools/perf/arch/arm64/util/pmu.c                   |   2 +-
 tools/perf/bench/epoll-ctl.c                       |   2 +-
 tools/perf/bench/epoll-wait.c                      |   2 +-
 tools/perf/bench/futex-hash.c                      |   2 +-
 tools/perf/bench/futex-lock-pi.c                   |   2 +-
 tools/perf/bench/futex-requeue.c                   |   2 +-
 tools/perf/bench/futex-wake-parallel.c             |   2 +-
 tools/perf/bench/futex-wake.c                      |   2 +-
 tools/perf/builtin-c2c.c                           |  13 +--
 tools/perf/builtin-ftrace.c                        |   2 +-
 tools/perf/builtin-kmem.c                          |   2 +-
 tools/perf/builtin-record.c                        |   2 +-
 tools/perf/builtin-sched.c                         |  65 +++++++------
 tools/perf/builtin-script.c                        |   5 +-
 tools/perf/builtin-stat.c                          |  67 +++++++-------
 tools/perf/tests/attr.c                            |   6 +-
 tools/perf/tests/bitmap.c                          |   2 +-
 tools/perf/tests/cpumap.c                          |   6 +-
 tools/perf/tests/event_update.c                    |   6 +-
 tools/perf/tests/mem2node.c                        |   2 +-
 tools/perf/tests/mmap-basic.c                      |   4 +-
 tools/perf/tests/openat-syscall-all-cpus.c         |  17 ++--
 tools/perf/tests/stat.c                            |   3 +-
 tools/perf/tests/topology.c                        |  30 +++---
 tools/perf/util/affinity.c                         |   2 +-
 tools/perf/util/auxtrace.c                         |  12 +--
 tools/perf/util/auxtrace.h                         |   5 +-
 tools/perf/util/bpf_counter.c                      |  12 ++-
 tools/perf/util/bpf_counter_cgroup.c               |  10 +-
 tools/perf/util/bpf_ftrace.c                       |   4 +-
 tools/perf/util/cpumap.c                           |  91 +++++++++---------
 tools/perf/util/cpumap.h                           |  26 +++---
 tools/perf/util/cputopo.c                          |   6 +-
 tools/perf/util/env.c                              |  29 +++---
 tools/perf/util/env.h                              |   3 +-
 tools/perf/util/evlist.c                           |   8 +-
 tools/perf/util/evlist.h                           |   2 +-
 tools/perf/util/evsel.c                            |   6 +-
 tools/perf/util/expr.c                             |   2 +-
 tools/perf/util/header.c                           |   6 +-
 tools/perf/util/mmap.c                             |  19 ++--
 tools/perf/util/mmap.h                             |   3 +-
 tools/perf/util/perf_api_probe.c                   |  15 +--
 tools/perf/util/python.c                           |   4 +-
 tools/perf/util/record.c                           |  11 ++-
 .../util/scripting-engines/trace-event-python.c    |   6 +-
 tools/perf/util/session.c                          |  10 +-
 tools/perf/util/stat-display.c                     |  34 ++++---
 tools/perf/util/stat.c                             |   2 +-
 tools/perf/util/stat.h                             |   2 +-
 tools/perf/util/svghelper.c                        |   6 +-
 tools/perf/util/synthetic-events.c                 |  12 +--
 tools/perf/util/synthetic-events.h                 |   3 +-
 tools/perf/util/util.h                             |   5 +-
 64 files changed, 431 insertions(+), 356 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index eacea3ab965a..ee66760f1e63 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -10,15 +10,24 @@
 #include <ctype.h>
 #include <limits.h>
 
-struct perf_cpu_map *perf_cpu_map__dummy_new(void)
+static struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus)
 {
-	struct perf_cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int));
+	struct perf_cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(struct perf_cpu) * nr_cpus);
 
 	if (cpus != NULL) {
-		cpus->nr = 1;
-		cpus->map[0] = -1;
+		cpus->nr = nr_cpus;
 		refcount_set(&cpus->refcnt, 1);
+
 	}
+	return cpus;
+}
+
+struct perf_cpu_map *perf_cpu_map__dummy_new(void)
+{
+	struct perf_cpu_map *cpus = perf_cpu_map__alloc(1);
+
+	if (cpus)
+		cpus->map[0].cpu = -1;
 
 	return cpus;
 }
@@ -54,15 +63,12 @@ static struct perf_cpu_map *cpu_map__default_new(void)
 	if (nr_cpus < 0)
 		return NULL;
 
-	cpus = malloc(sizeof(*cpus) + nr_cpus * sizeof(int));
+	cpus = perf_cpu_map__alloc(nr_cpus);
 	if (cpus != NULL) {
 		int i;
 
 		for (i = 0; i < nr_cpus; ++i)
-			cpus->map[i] = i;
-
-		cpus->nr = nr_cpus;
-		refcount_set(&cpus->refcnt, 1);
+			cpus->map[i].cpu = i;
 	}
 
 	return cpus;
@@ -73,31 +79,32 @@ struct perf_cpu_map *perf_cpu_map__default_new(void)
 	return cpu_map__default_new();
 }
 
-static int cmp_int(const void *a, const void *b)
+
+static int cmp_cpu(const void *a, const void *b)
 {
-	return *(const int *)a - *(const int*)b;
+	const struct perf_cpu *cpu_a = a, *cpu_b = b;
+
+	return cpu_a->cpu - cpu_b->cpu;
 }
 
-static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus)
+static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, const struct perf_cpu *tmp_cpus)
 {
-	size_t payload_size = nr_cpus * sizeof(int);
-	struct perf_cpu_map *cpus = malloc(sizeof(*cpus) + payload_size);
+	size_t payload_size = nr_cpus * sizeof(struct perf_cpu);
+	struct perf_cpu_map *cpus = perf_cpu_map__alloc(nr_cpus);
 	int i, j;
 
 	if (cpus != NULL) {
 		memcpy(cpus->map, tmp_cpus, payload_size);
-		qsort(cpus->map, nr_cpus, sizeof(int), cmp_int);
+		qsort(cpus->map, nr_cpus, sizeof(struct perf_cpu), cmp_cpu);
 		/* Remove dups */
 		j = 0;
 		for (i = 0; i < nr_cpus; i++) {
-			if (i == 0 || cpus->map[i] != cpus->map[i - 1])
-				cpus->map[j++] = cpus->map[i];
+			if (i == 0 || cpus->map[i].cpu != cpus->map[i - 1].cpu)
+				cpus->map[j++].cpu = cpus->map[i].cpu;
 		}
 		cpus->nr = j;
 		assert(j <= nr_cpus);
-		refcount_set(&cpus->refcnt, 1);
 	}
-
 	return cpus;
 }
 
@@ -105,7 +112,7 @@ struct perf_cpu_map *perf_cpu_map__read(FILE *file)
 {
 	struct perf_cpu_map *cpus = NULL;
 	int nr_cpus = 0;
-	int *tmp_cpus = NULL, *tmp;
+	struct perf_cpu *tmp_cpus = NULL, *tmp;
 	int max_entries = 0;
 	int n, cpu, prev;
 	char sep;
@@ -124,24 +131,24 @@ struct perf_cpu_map *perf_cpu_map__read(FILE *file)
 
 			if (new_max >= max_entries) {
 				max_entries = new_max + MAX_NR_CPUS / 2;
-				tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+				tmp = realloc(tmp_cpus, max_entries * sizeof(struct perf_cpu));
 				if (tmp == NULL)
 					goto out_free_tmp;
 				tmp_cpus = tmp;
 			}
 
 			while (++prev < cpu)
-				tmp_cpus[nr_cpus++] = prev;
+				tmp_cpus[nr_cpus++].cpu = prev;
 		}
 		if (nr_cpus == max_entries) {
 			max_entries += MAX_NR_CPUS;
-			tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+			tmp = realloc(tmp_cpus, max_entries * sizeof(struct perf_cpu));
 			if (tmp == NULL)
 				goto out_free_tmp;
 			tmp_cpus = tmp;
 		}
 
-		tmp_cpus[nr_cpus++] = cpu;
+		tmp_cpus[nr_cpus++].cpu = cpu;
 		if (n == 2 && sep == '-')
 			prev = cpu;
 		else
@@ -179,7 +186,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
 	unsigned long start_cpu, end_cpu = 0;
 	char *p = NULL;
 	int i, nr_cpus = 0;
-	int *tmp_cpus = NULL, *tmp;
+	struct perf_cpu *tmp_cpus = NULL, *tmp;
 	int max_entries = 0;
 
 	if (!cpu_list)
@@ -220,17 +227,17 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
 		for (; start_cpu <= end_cpu; start_cpu++) {
 			/* check for duplicates */
 			for (i = 0; i < nr_cpus; i++)
-				if (tmp_cpus[i] == (int)start_cpu)
+				if (tmp_cpus[i].cpu == (int)start_cpu)
 					goto invalid;
 
 			if (nr_cpus == max_entries) {
 				max_entries += MAX_NR_CPUS;
-				tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+				tmp = realloc(tmp_cpus, max_entries * sizeof(struct perf_cpu));
 				if (tmp == NULL)
 					goto invalid;
 				tmp_cpus = tmp;
 			}
-			tmp_cpus[nr_cpus++] = (int)start_cpu;
+			tmp_cpus[nr_cpus++].cpu = (int)start_cpu;
 		}
 		if (*p)
 			++p;
@@ -250,12 +257,16 @@ out:
 	return cpus;
 }
 
-int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
+struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
 {
+	struct perf_cpu result = {
+		.cpu = -1
+	};
+
 	if (cpus && idx < cpus->nr)
 		return cpus->map[idx];
 
-	return -1;
+	return result;
 }
 
 int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
@@ -265,10 +276,10 @@ int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
 
 bool perf_cpu_map__empty(const struct perf_cpu_map *map)
 {
-	return map ? map->map[0] == -1 : true;
+	return map ? map->map[0].cpu == -1 : true;
 }
 
-int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu)
+int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu)
 {
 	int low, high;
 
@@ -278,13 +289,13 @@ int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu)
 	low = 0;
 	high = cpus->nr;
 	while (low < high) {
-		int idx = (low + high) / 2,
-		    cpu_at_idx = cpus->map[idx];
+		int idx = (low + high) / 2;
+		struct perf_cpu cpu_at_idx = cpus->map[idx];
 
-		if (cpu_at_idx == cpu)
+		if (cpu_at_idx.cpu == cpu.cpu)
 			return idx;
 
-		if (cpu_at_idx > cpu)
+		if (cpu_at_idx.cpu > cpu.cpu)
 			high = idx;
 		else
 			low = idx + 1;
@@ -293,15 +304,19 @@ int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu)
 	return -1;
 }
 
-bool perf_cpu_map__has(const struct perf_cpu_map *cpus, int cpu)
+bool perf_cpu_map__has(const struct perf_cpu_map *cpus, struct perf_cpu cpu)
 {
 	return perf_cpu_map__idx(cpus, cpu) != -1;
 }
 
-int perf_cpu_map__max(struct perf_cpu_map *map)
+struct perf_cpu perf_cpu_map__max(struct perf_cpu_map *map)
 {
+	struct perf_cpu result = {
+		.cpu = -1
+	};
+
 	// cpu_map__trim_new() qsort()s it, cpu_map__default_new() sorts it as well.
-	return map->nr > 0 ? map->map[map->nr - 1] : -1;
+	return map->nr > 0 ? map->map[map->nr - 1] : result;
 }
 
 /*
@@ -315,7 +330,7 @@ int perf_cpu_map__max(struct perf_cpu_map *map)
 struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig,
 					 struct perf_cpu_map *other)
 {
-	int *tmp_cpus;
+	struct perf_cpu *tmp_cpus;
 	int tmp_len;
 	int i, j, k;
 	struct perf_cpu_map *merged;
@@ -329,19 +344,19 @@ struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig,
 	if (!other)
 		return orig;
 	if (orig->nr == other->nr &&
-	    !memcmp(orig->map, other->map, orig->nr * sizeof(int)))
+	    !memcmp(orig->map, other->map, orig->nr * sizeof(struct perf_cpu)))
 		return orig;
 
 	tmp_len = orig->nr + other->nr;
-	tmp_cpus = malloc(tmp_len * sizeof(int));
+	tmp_cpus = malloc(tmp_len * sizeof(struct perf_cpu));
 	if (!tmp_cpus)
 		return NULL;
 
 	/* Standard merge algorithm from wikipedia */
 	i = j = k = 0;
 	while (i < orig->nr && j < other->nr) {
-		if (orig->map[i] <= other->map[j]) {
-			if (orig->map[i] == other->map[j])
+		if (orig->map[i].cpu <= other->map[j].cpu) {
+			if (orig->map[i].cpu == other->map[j].cpu)
 				j++;
 			tmp_cpus[k++] = orig->map[i++];
 		} else
diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 245acbc53bd3..9a770bfdc804 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -407,7 +407,7 @@ perf_evlist__mmap_cb_get(struct perf_evlist *evlist, bool overwrite, int idx)
 
 static int
 perf_evlist__mmap_cb_mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
-			  int output, int cpu)
+			  int output, struct perf_cpu cpu)
 {
 	return perf_mmap__mmap(map, mp, output, cpu);
 }
@@ -426,7 +426,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops,
 	       int idx, struct perf_mmap_param *mp, int cpu_idx,
 	       int thread, int *_output, int *_output_overwrite)
 {
-	int evlist_cpu = perf_cpu_map__cpu(evlist->cpus, cpu_idx);
+	struct perf_cpu evlist_cpu = perf_cpu_map__cpu(evlist->cpus, cpu_idx);
 	struct perf_evsel *evsel;
 	int revent;
 
diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index f1e1665ef4bd..7ea86a44eae5 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -78,10 +78,10 @@ static int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthre
 
 static int
 sys_perf_event_open(struct perf_event_attr *attr,
-		    pid_t pid, int cpu, int group_fd,
+		    pid_t pid, struct perf_cpu cpu, int group_fd,
 		    unsigned long flags)
 {
-	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+	return syscall(__NR_perf_event_open, attr, pid, cpu.cpu, group_fd, flags);
 }
 
 static int get_group_fd(struct perf_evsel *evsel, int cpu_map_idx, int thread, int *group_fd)
@@ -113,7 +113,8 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu_map_idx, int thread, i
 int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 		     struct perf_thread_map *threads)
 {
-	int cpu, idx, thread, err = 0;
+	struct perf_cpu cpu;
+	int idx, thread, err = 0;
 
 	if (cpus == NULL) {
 		static struct perf_cpu_map *empty_cpu_map;
@@ -252,7 +253,7 @@ int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
 		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
 			int *fd = FD(evsel, idx, thread);
 			struct perf_mmap *map;
-			int cpu = perf_cpu_map__cpu(evsel->cpus, idx);
+			struct perf_cpu cpu = perf_cpu_map__cpu(evsel->cpus, idx);
 
 			if (fd == NULL || *fd < 0)
 				continue;
diff --git a/tools/lib/perf/include/internal/cpumap.h b/tools/lib/perf/include/internal/cpumap.h
index 71a31ed738c9..581f9ffb4237 100644
--- a/tools/lib/perf/include/internal/cpumap.h
+++ b/tools/lib/perf/include/internal/cpumap.h
@@ -4,6 +4,11 @@
 
 #include <linux/refcount.h>
 
+/** A wrapper around a CPU to avoid confusion with the perf_cpu_map's map's indices. */
+struct perf_cpu {
+	int cpu;
+};
+
 /**
  * A sized, reference counted, sorted array of integers representing CPU
  * numbers. This is commonly used to capture which CPUs a PMU is associated
@@ -16,13 +21,13 @@ struct perf_cpu_map {
 	/** Length of the map array. */
 	int		nr;
 	/** The CPU values. */
-	int		map[];
+	struct perf_cpu	map[];
 };
 
 #ifndef MAX_NR_CPUS
 #define MAX_NR_CPUS	2048
 #endif
 
-int perf_cpu_map__idx(const struct perf_cpu_map *cpus, int cpu);
+int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu);
 
 #endif /* __LIBPERF_INTERNAL_CPUMAP_H */
diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h
index 6f74269a3ad4..4cefade540bd 100644
--- a/tools/lib/perf/include/internal/evlist.h
+++ b/tools/lib/perf/include/internal/evlist.h
@@ -4,6 +4,7 @@
 
 #include <linux/list.h>
 #include <api/fd/array.h>
+#include <internal/cpumap.h>
 #include <internal/evsel.h>
 
 #define PERF_EVLIST__HLIST_BITS 8
@@ -36,7 +37,7 @@ typedef void
 typedef struct perf_mmap*
 (*perf_evlist_mmap__cb_get_t)(struct perf_evlist*, bool, int);
 typedef int
-(*perf_evlist_mmap__cb_mmap_t)(struct perf_mmap*, struct perf_mmap_param*, int, int);
+(*perf_evlist_mmap__cb_mmap_t)(struct perf_mmap*, struct perf_mmap_param*, int, struct perf_cpu);
 
 struct perf_evlist_mmap_ops {
 	perf_evlist_mmap__cb_idx_t	idx;
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index 1f3eacbad2e8..cfc9ebd7968e 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -6,8 +6,8 @@
 #include <linux/perf_event.h>
 #include <stdbool.h>
 #include <sys/types.h>
+#include <internal/cpumap.h>
 
-struct perf_cpu_map;
 struct perf_thread_map;
 struct xyarray;
 
@@ -27,7 +27,7 @@ struct perf_sample_id {
 	* queue number.
 	*/
 	int			 idx;
-	int			 cpu;
+	struct perf_cpu		 cpu;
 	pid_t			 tid;
 
 	/* Holds total ID period value for PERF_SAMPLE_READ processing. */
diff --git a/tools/lib/perf/include/internal/mmap.h b/tools/lib/perf/include/internal/mmap.h
index 5e3422f40ed5..5a062af8e9d8 100644
--- a/tools/lib/perf/include/internal/mmap.h
+++ b/tools/lib/perf/include/internal/mmap.h
@@ -6,6 +6,7 @@
 #include <linux/refcount.h>
 #include <linux/types.h>
 #include <stdbool.h>
+#include <internal/cpumap.h>
 
 /* perf sample has 16 bits size limit */
 #define PERF_SAMPLE_MAX_SIZE (1 << 16)
@@ -24,7 +25,7 @@ struct perf_mmap {
 	void			*base;
 	int			 mask;
 	int			 fd;
-	int			 cpu;
+	struct perf_cpu		 cpu;
 	refcount_t		 refcnt;
 	u64			 prev;
 	u64			 start;
@@ -46,7 +47,7 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map);
 void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev,
 		     bool overwrite, libperf_unmap_cb_t unmap_cb);
 int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
-		    int fd, int cpu);
+		    int fd, struct perf_cpu cpu);
 void perf_mmap__munmap(struct perf_mmap *map);
 void perf_mmap__get(struct perf_mmap *map);
 void perf_mmap__put(struct perf_mmap *map);
diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h
index 3f1c0afa3ccd..15b8faafd615 100644
--- a/tools/lib/perf/include/perf/cpumap.h
+++ b/tools/lib/perf/include/perf/cpumap.h
@@ -3,11 +3,10 @@
 #define __LIBPERF_CPUMAP_H
 
 #include <perf/core.h>
+#include <perf/cpumap.h>
 #include <stdio.h>
 #include <stdbool.h>
 
-struct perf_cpu_map;
-
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__dummy_new(void);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__default_new(void);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list);
@@ -16,11 +15,11 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map);
 LIBPERF_API struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig,
 						     struct perf_cpu_map *other);
 LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
-LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
+LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
 LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
-LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
-LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, int cpu);
+LIBPERF_API struct perf_cpu perf_cpu_map__max(struct perf_cpu_map *map);
+LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu);
 
 #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
 	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
index aaa457904008..f7ee07cb5818 100644
--- a/tools/lib/perf/mmap.c
+++ b/tools/lib/perf/mmap.c
@@ -32,7 +32,7 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
 }
 
 int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp,
-		    int fd, int cpu)
+		    int fd, struct perf_cpu cpu)
 {
 	map->prev = 0;
 	map->mask = mp->mask;
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 129c0272d65b..2e8b2c4365a0 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -203,9 +203,11 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
 	struct perf_cpu_map *online_cpus = perf_cpu_map__new(NULL);
 
 	/* Set option of each CPU we have */
-	for (i = 0; i < cpu__max_cpu(); i++) {
-		if (!perf_cpu_map__has(event_cpus, i) ||
-		    !perf_cpu_map__has(online_cpus, i))
+	for (i = 0; i < cpu__max_cpu().cpu; i++) {
+		struct perf_cpu cpu = { .cpu = i, };
+
+		if (!perf_cpu_map__has(event_cpus, cpu) ||
+		    !perf_cpu_map__has(online_cpus, cpu))
 			continue;
 
 		if (option & BIT(ETM_OPT_CTXTID)) {
@@ -522,9 +524,11 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused,
 
 	/* cpu map is not empty, we have specific CPUs to work with */
 	if (!perf_cpu_map__empty(event_cpus)) {
-		for (i = 0; i < cpu__max_cpu(); i++) {
-			if (!perf_cpu_map__has(event_cpus, i) ||
-			    !perf_cpu_map__has(online_cpus, i))
+		for (i = 0; i < cpu__max_cpu().cpu; i++) {
+			struct perf_cpu cpu = { .cpu = i, };
+
+			if (!perf_cpu_map__has(event_cpus, cpu) ||
+			    !perf_cpu_map__has(online_cpus, cpu))
 				continue;
 
 			if (cs_etm_is_ete(itr, i))
@@ -536,8 +540,10 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused,
 		}
 	} else {
 		/* get configuration for all CPUs in the system */
-		for (i = 0; i < cpu__max_cpu(); i++) {
-			if (!perf_cpu_map__has(online_cpus, i))
+		for (i = 0; i < cpu__max_cpu().cpu; i++) {
+			struct perf_cpu cpu = { .cpu = i, };
+
+			if (!perf_cpu_map__has(online_cpus, cpu))
 				continue;
 
 			if (cs_etm_is_ete(itr, i))
@@ -722,8 +728,10 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 	} else {
 		/* Make sure all specified CPUs are online */
 		for (i = 0; i < perf_cpu_map__nr(event_cpus); i++) {
-			if (perf_cpu_map__has(event_cpus, i) &&
-			    !perf_cpu_map__has(online_cpus, i))
+			struct perf_cpu cpu = { .cpu = i, };
+
+			if (perf_cpu_map__has(event_cpus, cpu) &&
+			    !perf_cpu_map__has(online_cpus, cpu))
 				return -EINVAL;
 		}
 
@@ -743,9 +751,12 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 
 	offset = CS_ETM_SNAPSHOT + 1;
 
-	for (i = 0; i < cpu__max_cpu() && offset < priv_size; i++)
-		if (perf_cpu_map__has(cpu_map, i))
+	for (i = 0; i < cpu__max_cpu().cpu && offset < priv_size; i++) {
+		struct perf_cpu cpu = { .cpu = i, };
+
+		if (perf_cpu_map__has(cpu_map, cpu))
 			cs_etm_get_metadata(i, &offset, itr, info);
+	}
 
 	perf_cpu_map__put(online_cpus);
 
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
index d3a18f9c85f6..79124bba713e 100644
--- a/tools/perf/arch/arm64/util/pmu.c
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -15,7 +15,7 @@ const struct pmu_events_map *pmu_events_map__find(void)
 		 * The cpumap should cover all CPUs. Otherwise, some CPUs may
 		 * not support some events or have different event IDs.
 		 */
-		if (pmu->cpus->nr != cpu__max_cpu())
+		if (pmu->cpus->nr != cpu__max_cpu().cpu)
 			return NULL;
 
 		return perf_pmu__find_map(pmu);
diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
index ddaca75c3bc0..1a17ec83d3c4 100644
--- a/tools/perf/bench/epoll-ctl.c
+++ b/tools/perf/bench/epoll-ctl.c
@@ -253,7 +253,7 @@ static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
 
 		if (!noaffinity) {
 			CPU_ZERO(&cpuset);
-			CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+			CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 			ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset);
 			if (ret)
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 79d13dbc0a47..0d1dd8879197 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -342,7 +342,7 @@ static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
 
 		if (!noaffinity) {
 			CPU_ZERO(&cpuset);
-			CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+			CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 			ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset);
 			if (ret)
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index fcdea3e44937..9627b6ab8670 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -177,7 +177,7 @@ int bench_futex_hash(int argc, const char **argv)
 			goto errmem;
 
 		CPU_ZERO(&cpuset);
-		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+		CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset);
 		if (ret)
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 137890f78e17..a512a320df74 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -136,7 +136,7 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr,
 			worker[i].futex = &global_futex;
 
 		CPU_ZERO(&cpuset);
-		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+		CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index f7a5ffebb940..aca47ce8b1e7 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -131,7 +131,7 @@ static void block_threads(pthread_t *w,
 	/* create and block all threads */
 	for (i = 0; i < params.nthreads; i++) {
 		CPU_ZERO(&cpuset);
-		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+		CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index 0983f40b4b40..888ee6037945 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -152,7 +152,7 @@ static void block_threads(pthread_t *w, pthread_attr_t thread_attr,
 	/* create and block all threads */
 	for (i = 0; i < params.nthreads; i++) {
 		CPU_ZERO(&cpuset);
-		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+		CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index 2226a475e782..aa82db51c0ab 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -105,7 +105,7 @@ static void block_threads(pthread_t *w,
 	/* create and block all threads */
 	for (i = 0; i < params.nthreads; i++) {
 		CPU_ZERO(&cpuset);
-		CPU_SET(cpu->map[i % cpu->nr], &cpuset);
+		CPU_SET(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, &cpuset);
 
 		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpuset))
 			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index ad1fbeafc93d..77dd4afacca4 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2015,7 +2015,8 @@ static int setup_nodes(struct perf_session *session)
 {
 	struct numa_node *n;
 	unsigned long **nodes;
-	int node, cpu, idx;
+	int node, idx;
+	struct perf_cpu cpu;
 	int *cpu2node;
 
 	if (c2c.node_info > 2)
@@ -2038,8 +2039,8 @@ static int setup_nodes(struct perf_session *session)
 	if (!cpu2node)
 		return -ENOMEM;
 
-	for (cpu = 0; cpu < c2c.cpus_cnt; cpu++)
-		cpu2node[cpu] = -1;
+	for (idx = 0; idx < c2c.cpus_cnt; idx++)
+		cpu2node[idx] = -1;
 
 	c2c.cpu2node = cpu2node;
 
@@ -2058,12 +2059,12 @@ static int setup_nodes(struct perf_session *session)
 			continue;
 
 		perf_cpu_map__for_each_cpu(cpu, idx, map) {
-			set_bit(cpu, set);
+			set_bit(cpu.cpu, set);
 
-			if (WARN_ONCE(cpu2node[cpu] != -1, "node/cpu topology bug"))
+			if (WARN_ONCE(cpu2node[cpu.cpu] != -1, "node/cpu topology bug"))
 				return -EINVAL;
 
-			cpu2node[cpu] = node;
+			cpu2node[cpu.cpu] = node;
 		}
 	}
 
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index f16c39a37a52..71452599f87d 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -281,7 +281,7 @@ static int set_tracing_cpumask(struct perf_cpu_map *cpumap)
 	int ret;
 	int last_cpu;
 
-	last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1);
+	last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1).cpu;
 	mask_size = last_cpu / 4 + 2; /* one more byte for EOS */
 	mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index da03a341c63c..99d7ff9a8eff 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -192,7 +192,7 @@ static int evsel__process_alloc_node_event(struct evsel *evsel, struct perf_samp
 	int ret = evsel__process_alloc_event(evsel, sample);
 
 	if (!ret) {
-		int node1 = cpu__get_node(sample->cpu),
+		int node1 = cpu__get_node((struct perf_cpu){.cpu = sample->cpu}),
 		    node2 = evsel__intval(evsel, sample, "node");
 
 		if (node1 != node2)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6ac2160913ea..0a63295d30f0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2796,7 +2796,7 @@ int cmd_record(int argc, const char **argv)
 	symbol__init(NULL);
 
 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
-		rec->affinity_mask.nbits = cpu__max_cpu();
+		rec->affinity_mask.nbits = cpu__max_cpu().cpu;
 		rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
 		if (!rec->affinity_mask.bits) {
 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 9da1da4749c9..72d446de9c60 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -167,7 +167,7 @@ struct trace_sched_handler {
 
 struct perf_sched_map {
 	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
-	int			*comp_cpus;
+	struct perf_cpu		*comp_cpus;
 	bool			 comp;
 	struct perf_thread_map *color_pids;
 	const char		*color_pids_str;
@@ -191,7 +191,7 @@ struct perf_sched {
  * Track the current task - that way we can know whether there's any
  * weird events, such as a task being switched away that is not current.
  */
-	int		 max_cpu;
+	struct perf_cpu	 max_cpu;
 	u32		 curr_pid[MAX_CPUS];
 	struct thread	 *curr_thread[MAX_CPUS];
 	char		 next_shortname1;
@@ -1535,28 +1535,31 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 	int new_shortname;
 	u64 timestamp0, timestamp = sample->time;
 	s64 delta;
-	int i, this_cpu = sample->cpu;
+	int i;
+	struct perf_cpu this_cpu = {
+		.cpu = sample->cpu,
+	};
 	int cpus_nr;
 	bool new_cpu = false;
 	const char *color = PERF_COLOR_NORMAL;
 	char stimestamp[32];
 
-	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
+	BUG_ON(this_cpu.cpu >= MAX_CPUS || this_cpu.cpu < 0);
 
-	if (this_cpu > sched->max_cpu)
+	if (this_cpu.cpu > sched->max_cpu.cpu)
 		sched->max_cpu = this_cpu;
 
 	if (sched->map.comp) {
 		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
-		if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
+		if (!test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
 			sched->map.comp_cpus[cpus_nr++] = this_cpu;
 			new_cpu = true;
 		}
 	} else
-		cpus_nr = sched->max_cpu;
+		cpus_nr = sched->max_cpu.cpu;
 
-	timestamp0 = sched->cpu_last_switched[this_cpu];
-	sched->cpu_last_switched[this_cpu] = timestamp;
+	timestamp0 = sched->cpu_last_switched[this_cpu.cpu];
+	sched->cpu_last_switched[this_cpu.cpu] = timestamp;
 	if (timestamp0)
 		delta = timestamp - timestamp0;
 	else
@@ -1577,7 +1580,7 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 		return -1;
 	}
 
-	sched->curr_thread[this_cpu] = thread__get(sched_in);
+	sched->curr_thread[this_cpu.cpu] = thread__get(sched_in);
 
 	printf("  ");
 
@@ -1608,8 +1611,10 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 	}
 
 	for (i = 0; i < cpus_nr; i++) {
-		int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
-		struct thread *curr_thread = sched->curr_thread[cpu];
+		struct perf_cpu cpu = {
+			.cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i,
+		};
+		struct thread *curr_thread = sched->curr_thread[cpu.cpu];
 		struct thread_runtime *curr_tr;
 		const char *pid_color = color;
 		const char *cpu_color = color;
@@ -1623,13 +1628,13 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 		if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
 			cpu_color = COLOR_CPUS;
 
-		if (cpu != this_cpu)
+		if (cpu.cpu != this_cpu.cpu)
 			color_fprintf(stdout, color, " ");
 		else
 			color_fprintf(stdout, cpu_color, "*");
 
-		if (sched->curr_thread[cpu]) {
-			curr_tr = thread__get_runtime(sched->curr_thread[cpu]);
+		if (sched->curr_thread[cpu.cpu]) {
+			curr_tr = thread__get_runtime(sched->curr_thread[cpu.cpu]);
 			if (curr_tr == NULL) {
 				thread__put(sched_in);
 				return -1;
@@ -1929,7 +1934,7 @@ static char *timehist_get_commstr(struct thread *thread)
 
 static void timehist_header(struct perf_sched *sched)
 {
-	u32 ncpus = sched->max_cpu + 1;
+	u32 ncpus = sched->max_cpu.cpu + 1;
 	u32 i, j;
 
 	printf("%15s %6s ", "time", "cpu");
@@ -2008,7 +2013,7 @@ static void timehist_print_sample(struct perf_sched *sched,
 	struct thread_runtime *tr = thread__priv(thread);
 	const char *next_comm = evsel__strval(evsel, sample, "next_comm");
 	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
-	u32 max_cpus = sched->max_cpu + 1;
+	u32 max_cpus = sched->max_cpu.cpu + 1;
 	char tstr[64];
 	char nstr[30];
 	u64 wait_time;
@@ -2389,7 +2394,7 @@ static void timehist_print_wakeup_event(struct perf_sched *sched,
 	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
 	printf("%15s [%04d] ", tstr, sample->cpu);
 	if (sched->show_cpu_visual)
-		printf(" %*s ", sched->max_cpu + 1, "");
+		printf(" %*s ", sched->max_cpu.cpu + 1, "");
 
 	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
 
@@ -2449,13 +2454,13 @@ static void timehist_print_migration_event(struct perf_sched *sched,
 {
 	struct thread *thread;
 	char tstr[64];
-	u32 max_cpus = sched->max_cpu + 1;
+	u32 max_cpus;
 	u32 ocpu, dcpu;
 
 	if (sched->summary_only)
 		return;
 
-	max_cpus = sched->max_cpu + 1;
+	max_cpus = sched->max_cpu.cpu + 1;
 	ocpu = evsel__intval(evsel, sample, "orig_cpu");
 	dcpu = evsel__intval(evsel, sample, "dest_cpu");
 
@@ -2918,7 +2923,7 @@ static void timehist_print_summary(struct perf_sched *sched,
 
 	printf("    Total scheduling time (msec): ");
 	print_sched_time(hist_time, 2);
-	printf(" (x %d)\n", sched->max_cpu);
+	printf(" (x %d)\n", sched->max_cpu.cpu);
 }
 
 typedef int (*sched_handler)(struct perf_tool *tool,
@@ -2935,9 +2940,11 @@ static int perf_timehist__process_sample(struct perf_tool *tool,
 {
 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
 	int err = 0;
-	int this_cpu = sample->cpu;
+	struct perf_cpu this_cpu = {
+		.cpu = sample->cpu,
+	};
 
-	if (this_cpu > sched->max_cpu)
+	if (this_cpu.cpu > sched->max_cpu.cpu)
 		sched->max_cpu = this_cpu;
 
 	if (evsel->handler != NULL) {
@@ -3054,10 +3061,10 @@ static int perf_sched__timehist(struct perf_sched *sched)
 		goto out;
 
 	/* pre-allocate struct for per-CPU idle stats */
-	sched->max_cpu = session->header.env.nr_cpus_online;
-	if (sched->max_cpu == 0)
-		sched->max_cpu = 4;
-	if (init_idle_threads(sched->max_cpu))
+	sched->max_cpu.cpu = session->header.env.nr_cpus_online;
+	if (sched->max_cpu.cpu == 0)
+		sched->max_cpu.cpu = 4;
+	if (init_idle_threads(sched->max_cpu.cpu))
 		goto out;
 
 	/* summary_only implies summary option, but don't overwrite summary if set */
@@ -3209,10 +3216,10 @@ static int setup_map_cpus(struct perf_sched *sched)
 {
 	struct perf_cpu_map *map;
 
-	sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
+	sched->max_cpu.cpu  = sysconf(_SC_NPROCESSORS_CONF);
 
 	if (sched->map.comp) {
-		sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
+		sched->map.comp_cpus = zalloc(sched->max_cpu.cpu * sizeof(int));
 		if (!sched->map.comp_cpus)
 			return -1;
 	}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index bb43529618b3..ecd4f99a6c14 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2115,7 +2115,8 @@ static struct scripting_ops	*scripting_ops;
 static void __process_stat(struct evsel *counter, u64 tstamp)
 {
 	int nthreads = perf_thread_map__nr(counter->core.threads);
-	int idx, cpu, thread;
+	int idx, thread;
+	struct perf_cpu cpu;
 	static int header_printed;
 
 	if (counter->core.system_wide)
@@ -2134,7 +2135,7 @@ static void __process_stat(struct evsel *counter, u64 tstamp)
 			counts = perf_counts(counter->counts, idx, thread);
 
 			printf("%3d %8d %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %s\n",
-				cpu,
+				cpu.cpu,
 				perf_thread_map__pid(counter->core.threads, thread),
 				counts->val,
 				counts->ena,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index dfb8f7847e6c..973ade18b72a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -234,7 +234,7 @@ static bool cpus_map_matched(struct evsel *a, struct evsel *b)
 		return false;
 
 	for (int i = 0; i < a->core.cpus->nr; i++) {
-		if (a->core.cpus->map[i] != b->core.cpus->map[i])
+		if (a->core.cpus->map[i].cpu != b->core.cpus->map[i].cpu)
 			return false;
 	}
 
@@ -331,7 +331,7 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
 				   struct perf_counts_values *count)
 {
 	struct perf_sample_id *sid = SID(counter, cpu_map_idx, thread);
-	int cpu = perf_cpu_map__cpu(evsel__cpus(counter), cpu_map_idx);
+	struct perf_cpu cpu = perf_cpu_map__cpu(evsel__cpus(counter), cpu_map_idx);
 
 	return perf_event__synthesize_stat(NULL, cpu, thread, sid->id, count,
 					   process_synthesized_event, NULL);
@@ -396,7 +396,8 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
 			fprintf(stat_config.output,
 				"%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
 					evsel__name(counter),
-					perf_cpu_map__cpu(evsel__cpus(counter), cpu_map_idx),
+					perf_cpu_map__cpu(evsel__cpus(counter),
+							  cpu_map_idx).cpu,
 					count->val, count->ena, count->run);
 		}
 	}
@@ -1328,61 +1329,61 @@ static const char *const aggr_mode__string[] = {
 };
 
 static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config *config __maybe_unused,
-						int cpu)
+						struct perf_cpu cpu)
 {
 	return aggr_cpu_id__socket(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __maybe_unused,
-					     int cpu)
+					     struct perf_cpu cpu)
 {
 	return aggr_cpu_id__die(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
-					      int cpu)
+					      struct perf_cpu cpu)
 {
 	return aggr_cpu_id__core(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __maybe_unused,
-					      int cpu)
+					      struct perf_cpu cpu)
 {
 	return aggr_cpu_id__node(cpu, /*data=*/NULL);
 }
 
 static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
-					      aggr_get_id_t get_id, int cpu)
+					      aggr_get_id_t get_id, struct perf_cpu cpu)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	if (aggr_cpu_id__is_empty(&config->cpus_aggr_map->map[cpu]))
-		config->cpus_aggr_map->map[cpu] = get_id(config, cpu);
+	if (aggr_cpu_id__is_empty(&config->cpus_aggr_map->map[cpu.cpu]))
+		config->cpus_aggr_map->map[cpu.cpu] = get_id(config, cpu);
 
-	id = config->cpus_aggr_map->map[cpu];
+	id = config->cpus_aggr_map->map[cpu.cpu];
 	return id;
 }
 
 static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config *config,
-						       int cpu)
+						       struct perf_cpu cpu)
 {
 	return perf_stat__get_aggr(config, perf_stat__get_socket, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *config,
-						    int cpu)
+						    struct perf_cpu cpu)
 {
 	return perf_stat__get_aggr(config, perf_stat__get_die, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config,
-						     int cpu)
+						     struct perf_cpu cpu)
 {
 	return perf_stat__get_aggr(config, perf_stat__get_core, cpu);
 }
 
 static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config *config,
-						     int cpu)
+						     struct perf_cpu cpu)
 {
 	return perf_stat__get_aggr(config, perf_stat__get_node, cpu);
 }
@@ -1467,7 +1468,7 @@ static int perf_stat_init_aggr_mode(void)
 	 * taking the highest cpu number to be the size of
 	 * the aggregation translate cpumap.
 	 */
-	nr = perf_cpu_map__max(evsel_list->core.cpus);
+	nr = perf_cpu_map__max(evsel_list->core.cpus).cpu;
 	stat_config.cpus_aggr_map = cpu_aggr_map__empty_new(nr + 1);
 	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
@@ -1495,55 +1496,55 @@ static void perf_stat__exit_aggr_mode(void)
 	stat_config.cpus_aggr_map = NULL;
 }
 
-static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(int cpu, void *data)
+static struct aggr_cpu_id perf_env__get_socket_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct perf_env *env = data;
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	if (cpu != -1)
-		id.socket = env->cpu[cpu].socket_id;
+	if (cpu.cpu != -1)
+		id.socket = env->cpu[cpu.cpu].socket_id;
 
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(int cpu, void *data)
+static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct perf_env *env = data;
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	if (cpu != -1) {
+	if (cpu.cpu != -1) {
 		/*
 		 * die_id is relative to socket, so start
 		 * with the socket ID and then add die to
 		 * make a unique ID.
 		 */
-		id.socket = env->cpu[cpu].socket_id;
-		id.die = env->cpu[cpu].die_id;
+		id.socket = env->cpu[cpu.cpu].socket_id;
+		id.die = env->cpu[cpu.cpu].die_id;
 	}
 
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(int cpu, void *data)
+static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct perf_env *env = data;
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
-	if (cpu != -1) {
+	if (cpu.cpu != -1) {
 		/*
 		 * core_id is relative to socket and die,
 		 * we need a global id. So we set
 		 * socket, die id and core id
 		 */
-		id.socket = env->cpu[cpu].socket_id;
-		id.die = env->cpu[cpu].die_id;
-		id.core = env->cpu[cpu].core_id;
+		id.socket = env->cpu[cpu.cpu].socket_id;
+		id.die = env->cpu[cpu.cpu].die_id;
+		id.core = env->cpu[cpu.cpu].core_id;
 	}
 
 	return id;
 }
 
-static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
+static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
@@ -1552,24 +1553,24 @@ static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(int cpu, void *data)
 }
 
 static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
-						     int cpu)
+						     struct perf_cpu cpu)
 {
 	return perf_env__get_socket_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *config __maybe_unused,
-						  int cpu)
+						  struct perf_cpu cpu)
 {
 	return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
 static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused,
-						   int cpu)
+						   struct perf_cpu cpu)
 {
 	return perf_env__get_core_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
 static struct aggr_cpu_id perf_stat__get_node_file(struct perf_stat_config *config __maybe_unused,
-						   int cpu)
+						   struct perf_cpu cpu)
 {
 	return perf_env__get_node_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index 0f73e300f207..56fba08a3037 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -65,7 +65,7 @@ do {									\
 
 #define WRITE_ASS(field, fmt) __WRITE_ASS(field, fmt, attr->field)
 
-static int store_event(struct perf_event_attr *attr, pid_t pid, int cpu,
+static int store_event(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cpu,
 		       int fd, int group_fd, unsigned long flags)
 {
 	FILE *file;
@@ -93,7 +93,7 @@ static int store_event(struct perf_event_attr *attr, pid_t pid, int cpu,
 	/* syscall arguments */
 	__WRITE_ASS(fd,       "d", fd);
 	__WRITE_ASS(group_fd, "d", group_fd);
-	__WRITE_ASS(cpu,      "d", cpu);
+	__WRITE_ASS(cpu,      "d", cpu.cpu);
 	__WRITE_ASS(pid,      "d", pid);
 	__WRITE_ASS(flags,   "lu", flags);
 
@@ -144,7 +144,7 @@ static int store_event(struct perf_event_attr *attr, pid_t pid, int cpu,
 	return 0;
 }
 
-void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
+void test_attr__open(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cpu,
 		     int fd, int group_fd, unsigned long flags)
 {
 	int errno_saved = errno;
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 384856347236..0bf399c49849 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -18,7 +18,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 
 	if (map && bm) {
 		for (i = 0; i < map->nr; i++)
-			set_bit(map->map[i], bm);
+			set_bit(map->map[i].cpu, bm);
 	}
 
 	if (map)
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index 89a155092f85..84e87e31f119 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -38,7 +38,7 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused,
 	TEST_ASSERT_VAL("wrong nr",  map->nr == 20);
 
 	for (i = 0; i < 20; i++) {
-		TEST_ASSERT_VAL("wrong cpu", map->map[i] == i);
+		TEST_ASSERT_VAL("wrong cpu", map->map[i].cpu == i);
 	}
 
 	perf_cpu_map__put(map);
@@ -67,8 +67,8 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
 
 	map = cpu_map__new_data(data);
 	TEST_ASSERT_VAL("wrong nr",  map->nr == 2);
-	TEST_ASSERT_VAL("wrong cpu", map->map[0] == 1);
-	TEST_ASSERT_VAL("wrong cpu", map->map[1] == 256);
+	TEST_ASSERT_VAL("wrong cpu", map->map[0].cpu == 1);
+	TEST_ASSERT_VAL("wrong cpu", map->map[1].cpu == 256);
 	TEST_ASSERT_VAL("wrong refcnt", refcount_read(&map->refcnt) == 1);
 	perf_cpu_map__put(map);
 	return 0;
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index d01532d40acb..16b6d6f47f38 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -76,9 +76,9 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
 	TEST_ASSERT_VAL("wrong id", ev->id == 123);
 	TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS);
 	TEST_ASSERT_VAL("wrong cpus", map->nr == 3);
-	TEST_ASSERT_VAL("wrong cpus", map->map[0] == 1);
-	TEST_ASSERT_VAL("wrong cpus", map->map[1] == 2);
-	TEST_ASSERT_VAL("wrong cpus", map->map[2] == 3);
+	TEST_ASSERT_VAL("wrong cpus", map->map[0].cpu == 1);
+	TEST_ASSERT_VAL("wrong cpus", map->map[1].cpu == 2);
+	TEST_ASSERT_VAL("wrong cpus", map->map[2].cpu == 3);
 	perf_cpu_map__put(map);
 	return 0;
 }
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index b17b86391383..f4a4aba33f76 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -31,7 +31,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 
 	if (map && bm) {
 		for (i = 0; i < map->nr; i++) {
-			set_bit(map->map[i], bm);
+			set_bit(map->map[i].cpu, bm);
 		}
 	}
 
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 90b2feda31ac..0ad62914b4d7 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -59,11 +59,11 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest
 	}
 
 	CPU_ZERO(&cpu_set);
-	CPU_SET(cpus->map[0], &cpu_set);
+	CPU_SET(cpus->map[0].cpu, &cpu_set);
 	sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
 	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 		pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-			 cpus->map[0], str_error_r(errno, sbuf, sizeof(sbuf)));
+			 cpus->map[0].cpu, str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_free_cpus;
 	}
 
diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index ca0a50e92839..1ab362323d25 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -22,7 +22,8 @@
 static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __maybe_unused,
 						  int subtest __maybe_unused)
 {
-	int err = -1, fd, idx, cpu;
+	int err = -1, fd, idx;
+	struct perf_cpu cpu;
 	struct perf_cpu_map *cpus;
 	struct evsel *evsel;
 	unsigned int nr_openat_calls = 111, i;
@@ -66,15 +67,15 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 		 * without CPU_ALLOC. 1024 cpus in 2010 still seems
 		 * a reasonable upper limit tho :-)
 		 */
-		if (cpu >= CPU_SETSIZE) {
-			pr_debug("Ignoring CPU %d\n", cpu);
+		if (cpu.cpu >= CPU_SETSIZE) {
+			pr_debug("Ignoring CPU %d\n", cpu.cpu);
 			continue;
 		}
 
-		CPU_SET(cpu, &cpu_set);
+		CPU_SET(cpu.cpu, &cpu_set);
 		if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 			pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-				 cpu,
+				 cpu.cpu,
 				 str_error_r(errno, sbuf, sizeof(sbuf)));
 			goto out_close_fd;
 		}
@@ -82,7 +83,7 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 			fd = openat(0, "/etc/passwd", O_RDONLY);
 			close(fd);
 		}
-		CPU_CLR(cpu, &cpu_set);
+		CPU_CLR(cpu.cpu, &cpu_set);
 	}
 
 	evsel->core.cpus = perf_cpu_map__get(cpus);
@@ -92,7 +93,7 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
 		unsigned int expected;
 
-		if (cpu >= CPU_SETSIZE)
+		if (cpu.cpu >= CPU_SETSIZE)
 			continue;
 
 		if (evsel__read_on_cpu(evsel, idx, 0) < 0) {
@@ -104,7 +105,7 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb
 		expected = nr_openat_calls + idx;
 		if (perf_counts(evsel->counts, idx, 0)->val != expected) {
 			pr_debug("evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
-				 expected, cpu, perf_counts(evsel->counts, idx, 0)->val);
+				 expected, cpu.cpu, perf_counts(evsel->counts, idx, 0)->val);
 			err = -1;
 		}
 	}
diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c
index 2eb096b5e6da..500974040fe3 100644
--- a/tools/perf/tests/stat.c
+++ b/tools/perf/tests/stat.c
@@ -87,7 +87,8 @@ static int test__synthesize_stat(struct test_suite *test __maybe_unused, int sub
 	count.run = 300;
 
 	TEST_ASSERT_VAL("failed to synthesize stat_config",
-		!perf_event__synthesize_stat(NULL, 1, 2, 3, &count, process_stat_event, NULL));
+			!perf_event__synthesize_stat(NULL, (struct perf_cpu){.cpu = 1}, 2, 3,
+						     &count, process_stat_event, NULL));
 
 	return 0;
 }
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 33e4cb81265c..c4ef0c7002f1 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -112,7 +112,9 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu);
 
 	for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
-		if (!perf_cpu_map__has(map, i))
+		struct perf_cpu cpu = { .cpu = i };
+
+		if (!perf_cpu_map__has(map, cpu))
 			continue;
 		pr_debug("CPU %d, core %d, socket %d\n", i,
 			 session->header.env.cpu[i].core_id,
@@ -122,15 +124,15 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	// Test that CPU ID contains socket, die, core and CPU
 	for (i = 0; i < map->nr; i++) {
 		id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL);
-		TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i] == id.cpu);
+		TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i].cpu == id.cpu.cpu);
 
 		TEST_ASSERT_VAL("Cpu map - Core ID doesn't match",
-			session->header.env.cpu[map->map[i]].core_id == id.core);
+			session->header.env.cpu[map->map[i].cpu].core_id == id.core);
 		TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i]].socket_id == id.socket);
+			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
 
 		TEST_ASSERT_VAL("Cpu map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i]].die_id == id.die);
+			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
 		TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1);
 	}
@@ -139,13 +141,13 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	for (i = 0; i < map->nr; i++) {
 		id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Core map - Core ID doesn't match",
-			session->header.env.cpu[map->map[i]].core_id == id.core);
+			session->header.env.cpu[map->map[i].cpu].core_id == id.core);
 
 		TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i]].socket_id == id.socket);
+			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
 
 		TEST_ASSERT_VAL("Core map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i]].die_id == id.die);
+			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
 		TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
 	}
@@ -154,14 +156,14 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	for (i = 0; i < map->nr; i++) {
 		id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i]].socket_id == id.socket);
+			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
 
 		TEST_ASSERT_VAL("Die map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i]].die_id == id.die);
+			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
 
 		TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
-		TEST_ASSERT_VAL("Die map - CPU is set", id.cpu == -1);
+		TEST_ASSERT_VAL("Die map - CPU is set", id.cpu.cpu == -1);
 		TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1);
 	}
 
@@ -169,12 +171,12 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	for (i = 0; i < map->nr; i++) {
 		id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i]].socket_id == id.socket);
+			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
 
 		TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
 		TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1);
-		TEST_ASSERT_VAL("Socket map - CPU is set", id.cpu == -1);
+		TEST_ASSERT_VAL("Socket map - CPU is set", id.cpu.cpu == -1);
 		TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1);
 	}
 
@@ -186,7 +188,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 		TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
 		TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
 		TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
-		TEST_ASSERT_VAL("Node map - CPU is set", id.cpu == -1);
+		TEST_ASSERT_VAL("Node map - CPU is set", id.cpu.cpu == -1);
 		TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1);
 	}
 	perf_session__delete(session);
diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c
index 7b12bd7a3080..f1e30d566db3 100644
--- a/tools/perf/util/affinity.c
+++ b/tools/perf/util/affinity.c
@@ -11,7 +11,7 @@
 
 static int get_cpu_set_size(void)
 {
-	int sz = cpu__max_cpu() + 8 - 1;
+	int sz = cpu__max_cpu().cpu + 8 - 1;
 	/*
 	 * sched_getaffinity doesn't like masks smaller than the kernel.
 	 * Hopefully that's big enough.
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index c679394b898d..5632efc44738 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -123,7 +123,7 @@ int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
 	mm->prev = 0;
 	mm->idx = mp->idx;
 	mm->tid = mp->tid;
-	mm->cpu = mp->cpu;
+	mm->cpu = mp->cpu.cpu;
 
 	if (!mp->len) {
 		mm->base = NULL;
@@ -180,7 +180,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
 		else
 			mp->tid = -1;
 	} else {
-		mp->cpu = -1;
+		mp->cpu.cpu = -1;
 		mp->tid = perf_thread_map__pid(evlist->core.threads, idx);
 	}
 }
@@ -292,7 +292,7 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues,
 	if (!queue->set) {
 		queue->set = true;
 		queue->tid = buffer->tid;
-		queue->cpu = buffer->cpu;
+		queue->cpu = buffer->cpu.cpu;
 	}
 
 	buffer->buffer_nr = queues->next_buffer_nr++;
@@ -339,11 +339,11 @@ static int auxtrace_queues__split_buffer(struct auxtrace_queues *queues,
 	return 0;
 }
 
-static bool filter_cpu(struct perf_session *session, int cpu)
+static bool filter_cpu(struct perf_session *session, struct perf_cpu cpu)
 {
 	unsigned long *cpu_bitmap = session->itrace_synth_opts->cpu_bitmap;
 
-	return cpu_bitmap && cpu != -1 && !test_bit(cpu, cpu_bitmap);
+	return cpu_bitmap && cpu.cpu != -1 && !test_bit(cpu.cpu, cpu_bitmap);
 }
 
 static int auxtrace_queues__add_buffer(struct auxtrace_queues *queues,
@@ -399,7 +399,7 @@ int auxtrace_queues__add_event(struct auxtrace_queues *queues,
 	struct auxtrace_buffer buffer = {
 		.pid = -1,
 		.tid = event->auxtrace.tid,
-		.cpu = event->auxtrace.cpu,
+		.cpu = { event->auxtrace.cpu },
 		.data_offset = data_offset,
 		.offset = event->auxtrace.offset,
 		.reference = event->auxtrace.reference,
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index bbf0d78c6401..19910b9011f3 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
+#include <internal/cpumap.h>
 #include <asm/bitsperlong.h>
 #include <asm/barrier.h>
 
@@ -240,7 +241,7 @@ struct auxtrace_buffer {
 	size_t			size;
 	pid_t			pid;
 	pid_t			tid;
-	int			cpu;
+	struct perf_cpu		cpu;
 	void			*data;
 	off_t			data_offset;
 	void			*mmap_addr;
@@ -350,7 +351,7 @@ struct auxtrace_mmap_params {
 	int		prot;
 	int		idx;
 	pid_t		tid;
-	int		cpu;
+	struct perf_cpu	cpu;
 };
 
 /**
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 80d1a3a31052..328479df5e16 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -540,7 +540,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 		    filter_type == BPERF_FILTER_TGID)
 			key = evsel->core.threads->map[i].pid;
 		else if (filter_type == BPERF_FILTER_CPU)
-			key = evsel->core.cpus->map[i];
+			key = evsel->core.cpus->map[i].cpu;
 		else
 			break;
 
@@ -584,7 +584,7 @@ static int bperf_sync_counters(struct evsel *evsel)
 
 	num_cpu = all_cpu_map->nr;
 	for (i = 0; i < num_cpu; i++) {
-		cpu = all_cpu_map->map[i];
+		cpu = all_cpu_map->map[i].cpu;
 		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
 	}
 	return 0;
@@ -605,7 +605,7 @@ static int bperf__disable(struct evsel *evsel)
 static int bperf__read(struct evsel *evsel)
 {
 	struct bperf_follower_bpf *skel = evsel->follower_skel;
-	__u32 num_cpu_bpf = cpu__max_cpu();
+	__u32 num_cpu_bpf = cpu__max_cpu().cpu;
 	struct bpf_perf_event_value values[num_cpu_bpf];
 	int reading_map_fd, err = 0;
 	__u32 i;
@@ -615,6 +615,7 @@ static int bperf__read(struct evsel *evsel)
 	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
 
 	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+		struct perf_cpu entry;
 		__u32 cpu;
 
 		err = bpf_map_lookup_elem(reading_map_fd, &i, values);
@@ -624,14 +625,15 @@ static int bperf__read(struct evsel *evsel)
 		case BPERF_FILTER_GLOBAL:
 			assert(i == 0);
 
-			perf_cpu_map__for_each_cpu(cpu, j, all_cpu_map) {
+			perf_cpu_map__for_each_cpu(entry, j, all_cpu_map) {
+				cpu = entry.cpu;
 				perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
 				perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
 				perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
 			}
 			break;
 		case BPERF_FILTER_CPU:
-			cpu = evsel->core.cpus->map[i];
+			cpu = evsel->core.cpus->map[i].cpu;
 			perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
 			perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
 			perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
index cbc6c2bca488..631e34a0b66f 100644
--- a/tools/perf/util/bpf_counter_cgroup.c
+++ b/tools/perf/util/bpf_counter_cgroup.c
@@ -48,7 +48,7 @@ static int bperf_load_program(struct evlist *evlist)
 	struct cgroup *cgrp, *leader_cgrp;
 	__u32 i, cpu;
 	__u32 nr_cpus = evlist->core.all_cpus->nr;
-	int total_cpus = cpu__max_cpu();
+	int total_cpus = cpu__max_cpu().cpu;
 	int map_size, map_fd;
 	int prog_fd, err;
 
@@ -125,7 +125,7 @@ static int bperf_load_program(struct evlist *evlist)
 			for (cpu = 0; cpu < nr_cpus; cpu++) {
 				int fd = FD(evsel, cpu);
 				__u32 idx = evsel->core.idx * total_cpus +
-					evlist->core.all_cpus->map[cpu];
+					evlist->core.all_cpus->map[cpu].cpu;
 
 				err = bpf_map_update_elem(map_fd, &idx, &fd,
 							  BPF_ANY);
@@ -212,7 +212,7 @@ static int bperf_cgrp__sync_counters(struct evlist *evlist)
 	int prog_fd = bpf_program__fd(skel->progs.trigger_read);
 
 	for (i = 0; i < nr_cpus; i++) {
-		cpu = evlist->core.all_cpus->map[i];
+		cpu = evlist->core.all_cpus->map[i].cpu;
 		bperf_trigger_reading(prog_fd, cpu);
 	}
 
@@ -245,7 +245,7 @@ static int bperf_cgrp__read(struct evsel *evsel)
 {
 	struct evlist *evlist = evsel->evlist;
 	int i, cpu, nr_cpus = evlist->core.all_cpus->nr;
-	int total_cpus = cpu__max_cpu();
+	int total_cpus = cpu__max_cpu().cpu;
 	struct perf_counts_values *counts;
 	struct bpf_perf_event_value *values;
 	int reading_map_fd, err = 0;
@@ -272,7 +272,7 @@ static int bperf_cgrp__read(struct evsel *evsel)
 		}
 
 		for (i = 0; i < nr_cpus; i++) {
-			cpu = evlist->core.all_cpus->map[i];
+			cpu = evlist->core.all_cpus->map[i].cpu;
 
 			counts = perf_counts(evsel->counts, i, 0);
 			counts->val = values[cpu].counter;
diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c
index 28dc4c60c788..d756cc66eef3 100644
--- a/tools/perf/util/bpf_ftrace.c
+++ b/tools/perf/util/bpf_ftrace.c
@@ -63,7 +63,7 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace)
 		fd = bpf_map__fd(skel->maps.cpu_filter);
 
 		for (i = 0; i < ncpus; i++) {
-			cpu = perf_cpu_map__cpu(ftrace->evlist->core.cpus, i);
+			cpu = perf_cpu_map__cpu(ftrace->evlist->core.cpus, i).cpu;
 			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
 		}
 	}
@@ -122,7 +122,7 @@ int perf_ftrace__latency_read_bpf(struct perf_ftrace *ftrace __maybe_unused,
 	int i, fd, err;
 	u32 idx;
 	u64 *hist;
-	int ncpus = cpu__max_cpu();
+	int ncpus = cpu__max_cpu().cpu;
 
 	fd = bpf_map__fd(skel->maps.latency);
 
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 48ce583af0ec..12b2243222b0 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -13,8 +13,8 @@
 #include <linux/ctype.h>
 #include <linux/zalloc.h>
 
-static int max_cpu_num;
-static int max_present_cpu_num;
+static struct perf_cpu max_cpu_num;
+static struct perf_cpu max_present_cpu_num;
 static int max_node_num;
 /**
  * The numa node X as read from /sys/devices/system/node/nodeX indexed by the
@@ -37,9 +37,9 @@ static struct perf_cpu_map *cpu_map__from_entries(struct cpu_map_entries *cpus)
 			 * otherwise it would become 65535.
 			 */
 			if (cpus->cpu[i] == (u16) -1)
-				map->map[i] = -1;
+				map->map[i].cpu = -1;
 			else
-				map->map[i] = (int) cpus->cpu[i];
+				map->map[i].cpu = (int) cpus->cpu[i];
 		}
 	}
 
@@ -58,7 +58,7 @@ static struct perf_cpu_map *cpu_map__from_mask(struct perf_record_record_cpu_map
 		int cpu, i = 0;
 
 		for_each_set_bit(cpu, mask->mask, nbits)
-			map->map[i++] = cpu;
+			map->map[i++].cpu = cpu;
 	}
 	return map;
 
@@ -91,7 +91,7 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr)
 
 		cpus->nr = nr;
 		for (i = 0; i < nr; i++)
-			cpus->map[i] = -1;
+			cpus->map[i].cpu = -1;
 
 		refcount_set(&cpus->refcnt, 1);
 	}
@@ -126,13 +126,13 @@ static int cpu__get_topology_int(int cpu, const char *name, int *value)
 	return sysfs__read_int(path, value);
 }
 
-int cpu__get_socket_id(int cpu)
+int cpu__get_socket_id(struct perf_cpu cpu)
 {
-	int value, ret = cpu__get_topology_int(cpu, "physical_package_id", &value);
+	int value, ret = cpu__get_topology_int(cpu.cpu, "physical_package_id", &value);
 	return ret ?: value;
 }
 
-struct aggr_cpu_id aggr_cpu_id__socket(int cpu, void *data __maybe_unused)
+struct aggr_cpu_id aggr_cpu_id__socket(struct perf_cpu cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
@@ -161,7 +161,8 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 				       aggr_cpu_id_get_t get_id,
 				       void *data)
 {
-	int cpu, idx;
+	int idx;
+	struct perf_cpu cpu;
 	struct cpu_aggr_map *c = cpu_aggr_map__empty_new(cpus->nr);
 
 	if (!c)
@@ -201,14 +202,14 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 
 }
 
-int cpu__get_die_id(int cpu)
+int cpu__get_die_id(struct perf_cpu cpu)
 {
-	int value, ret = cpu__get_topology_int(cpu, "die_id", &value);
+	int value, ret = cpu__get_topology_int(cpu.cpu, "die_id", &value);
 
 	return ret ?: value;
 }
 
-struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data)
+struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data)
 {
 	struct aggr_cpu_id id;
 	int die;
@@ -231,13 +232,13 @@ struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data)
 	return id;
 }
 
-int cpu__get_core_id(int cpu)
+int cpu__get_core_id(struct perf_cpu cpu)
 {
-	int value, ret = cpu__get_topology_int(cpu, "core_id", &value);
+	int value, ret = cpu__get_topology_int(cpu.cpu, "core_id", &value);
 	return ret ?: value;
 }
 
-struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data)
+struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data)
 {
 	struct aggr_cpu_id id;
 	int core = cpu__get_core_id(cpu);
@@ -256,7 +257,7 @@ struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data)
 
 }
 
-struct aggr_cpu_id aggr_cpu_id__cpu(int cpu, void *data)
+struct aggr_cpu_id aggr_cpu_id__cpu(struct perf_cpu cpu, void *data)
 {
 	struct aggr_cpu_id id;
 
@@ -270,7 +271,7 @@ struct aggr_cpu_id aggr_cpu_id__cpu(int cpu, void *data)
 
 }
 
-struct aggr_cpu_id aggr_cpu_id__node(int cpu, void *data __maybe_unused)
+struct aggr_cpu_id aggr_cpu_id__node(struct perf_cpu cpu, void *data __maybe_unused)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
 
@@ -318,8 +319,8 @@ static void set_max_cpu_num(void)
 	int ret = -1;
 
 	/* set up default */
-	max_cpu_num = 4096;
-	max_present_cpu_num = 4096;
+	max_cpu_num.cpu = 4096;
+	max_present_cpu_num.cpu = 4096;
 
 	mnt = sysfs__mountpoint();
 	if (!mnt)
@@ -332,7 +333,7 @@ static void set_max_cpu_num(void)
 		goto out;
 	}
 
-	ret = get_max_num(path, &max_cpu_num);
+	ret = get_max_num(path, &max_cpu_num.cpu);
 	if (ret)
 		goto out;
 
@@ -343,11 +344,11 @@ static void set_max_cpu_num(void)
 		goto out;
 	}
 
-	ret = get_max_num(path, &max_present_cpu_num);
+	ret = get_max_num(path, &max_present_cpu_num.cpu);
 
 out:
 	if (ret)
-		pr_err("Failed to read max cpus, using default of %d\n", max_cpu_num);
+		pr_err("Failed to read max cpus, using default of %d\n", max_cpu_num.cpu);
 }
 
 /* Determine highest possible node in the system for sparse allocation */
@@ -386,31 +387,31 @@ int cpu__max_node(void)
 	return max_node_num;
 }
 
-int cpu__max_cpu(void)
+struct perf_cpu cpu__max_cpu(void)
 {
-	if (unlikely(!max_cpu_num))
+	if (unlikely(!max_cpu_num.cpu))
 		set_max_cpu_num();
 
 	return max_cpu_num;
 }
 
-int cpu__max_present_cpu(void)
+struct perf_cpu cpu__max_present_cpu(void)
 {
-	if (unlikely(!max_present_cpu_num))
+	if (unlikely(!max_present_cpu_num.cpu))
 		set_max_cpu_num();
 
 	return max_present_cpu_num;
 }
 
 
-int cpu__get_node(int cpu)
+int cpu__get_node(struct perf_cpu cpu)
 {
 	if (unlikely(cpunode_map == NULL)) {
 		pr_debug("cpu_map not initialized\n");
 		return -1;
 	}
 
-	return cpunode_map[cpu];
+	return cpunode_map[cpu.cpu];
 }
 
 static int init_cpunode_map(void)
@@ -420,13 +421,13 @@ static int init_cpunode_map(void)
 	set_max_cpu_num();
 	set_max_node_num();
 
-	cpunode_map = calloc(max_cpu_num, sizeof(int));
+	cpunode_map = calloc(max_cpu_num.cpu, sizeof(int));
 	if (!cpunode_map) {
 		pr_err("%s: calloc failed\n", __func__);
 		return -1;
 	}
 
-	for (i = 0; i < max_cpu_num; i++)
+	for (i = 0; i < max_cpu_num.cpu; i++)
 		cpunode_map[i] = -1;
 
 	return 0;
@@ -487,35 +488,37 @@ int cpu__setup_cpunode_map(void)
 
 size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size)
 {
-	int i, cpu, start = -1;
+	int i, start = -1;
 	bool first = true;
 	size_t ret = 0;
 
 #define COMMA first ? "" : ","
 
 	for (i = 0; i < map->nr + 1; i++) {
+		struct perf_cpu cpu = { .cpu = INT_MAX };
 		bool last = i == map->nr;
 
-		cpu = last ? INT_MAX : map->map[i];
+		if (!last)
+			cpu = map->map[i];
 
 		if (start == -1) {
 			start = i;
 			if (last) {
 				ret += snprintf(buf + ret, size - ret,
 						"%s%d", COMMA,
-						map->map[i]);
+						map->map[i].cpu);
 			}
-		} else if (((i - start) != (cpu - map->map[start])) || last) {
+		} else if (((i - start) != (cpu.cpu - map->map[start].cpu)) || last) {
 			int end = i - 1;
 
 			if (start == end) {
 				ret += snprintf(buf + ret, size - ret,
 						"%s%d", COMMA,
-						map->map[start]);
+						map->map[start].cpu);
 			} else {
 				ret += snprintf(buf + ret, size - ret,
 						"%s%d-%d", COMMA,
-						map->map[start], map->map[end]);
+						map->map[start].cpu, map->map[end].cpu);
 			}
 			first = false;
 			start = i;
@@ -542,23 +545,23 @@ size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size)
 	int i, cpu;
 	char *ptr = buf;
 	unsigned char *bitmap;
-	int last_cpu = perf_cpu_map__cpu(map, map->nr - 1);
+	struct perf_cpu last_cpu = perf_cpu_map__cpu(map, map->nr - 1);
 
 	if (buf == NULL)
 		return 0;
 
-	bitmap = zalloc(last_cpu / 8 + 1);
+	bitmap = zalloc(last_cpu.cpu / 8 + 1);
 	if (bitmap == NULL) {
 		buf[0] = '\0';
 		return 0;
 	}
 
 	for (i = 0; i < map->nr; i++) {
-		cpu = perf_cpu_map__cpu(map, i);
+		cpu = perf_cpu_map__cpu(map, i).cpu;
 		bitmap[cpu / 8] |= 1 << (cpu % 8);
 	}
 
-	for (cpu = last_cpu / 4 * 4; cpu >= 0; cpu -= 4) {
+	for (cpu = last_cpu.cpu / 4 * 4; cpu >= 0; cpu -= 4) {
 		unsigned char bits = bitmap[cpu / 8];
 
 		if (cpu % 8)
@@ -594,7 +597,7 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 		a->socket == b->socket &&
 		a->die == b->die &&
 		a->core == b->core &&
-		a->cpu == b->cpu;
+		a->cpu.cpu == b->cpu.cpu;
 }
 
 bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
@@ -604,7 +607,7 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
 		a->socket == -1 &&
 		a->die == -1 &&
 		a->core == -1 &&
-		a->cpu == -1;
+		a->cpu.cpu == -1;
 }
 
 struct aggr_cpu_id aggr_cpu_id__empty(void)
@@ -615,7 +618,7 @@ struct aggr_cpu_id aggr_cpu_id__empty(void)
 		.socket = -1,
 		.die = -1,
 		.core = -1,
-		.cpu = -1
+		.cpu = (struct perf_cpu){ .cpu = -1 },
 	};
 	return ret;
 }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index b98cd1739677..afc15027d678 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -23,7 +23,7 @@ struct aggr_cpu_id {
 	/** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */
 	int core;
 	/** CPU aggregation, note there is one CPU for each SMT thread. */
-	int cpu;
+	struct perf_cpu cpu;
 };
 
 /** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */
@@ -48,28 +48,28 @@ const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
 int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
-int cpu__max_cpu(void);
-int cpu__max_present_cpu(void);
+struct perf_cpu cpu__max_cpu(void);
+struct perf_cpu cpu__max_present_cpu(void);
 /**
  * cpu__get_node - Returns the numa node X as read from
  * /sys/devices/system/node/nodeX for the given CPU.
  */
-int cpu__get_node(int cpu);
+int cpu__get_node(struct perf_cpu cpu);
 /**
  * cpu__get_socket_id - Returns the socket number as read from
  * /sys/devices/system/cpu/cpuX/topology/physical_package_id for the given CPU.
  */
-int cpu__get_socket_id(int cpu);
+int cpu__get_socket_id(struct perf_cpu cpu);
 /**
  * cpu__get_die_id - Returns the die id as read from
  * /sys/devices/system/cpu/cpuX/topology/die_id for the given CPU.
  */
-int cpu__get_die_id(int cpu);
+int cpu__get_die_id(struct perf_cpu cpu);
 /**
  * cpu__get_core_id - Returns the core id as read from
  * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU.
  */
-int cpu__get_core_id(int cpu);
+int cpu__get_core_id(struct perf_cpu cpu);
 
 /**
  * cpu_aggr_map__empty_new - Create a cpu_aggr_map of size nr with every entry
@@ -77,7 +77,7 @@ int cpu__get_core_id(int cpu);
  */
 struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
 
-typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(int cpu, void *data);
+typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(struct perf_cpu cpu, void *data);
 
 /**
  * cpu_aggr_map__new - Create a cpu_aggr_map with an aggr_cpu_id for each cpu in
@@ -98,29 +98,29 @@ struct aggr_cpu_id aggr_cpu_id__empty(void);
  * the socket for cpu. The function signature is compatible with
  * aggr_cpu_id_get_t.
  */
-struct aggr_cpu_id aggr_cpu_id__socket(int cpu, void *data);
+struct aggr_cpu_id aggr_cpu_id__socket(struct perf_cpu cpu, void *data);
 /**
  * aggr_cpu_id__die - Create an aggr_cpu_id with the die and socket populated
  * with the die and socket for cpu. The function signature is compatible with
  * aggr_cpu_id_get_t.
  */
-struct aggr_cpu_id aggr_cpu_id__die(int cpu, void *data);
+struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data);
 /**
  * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket
  * populated with the core, die and socket for cpu. The function signature is
  * compatible with aggr_cpu_id_get_t.
  */
-struct aggr_cpu_id aggr_cpu_id__core(int cpu, void *data);
+struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data);
 /**
  * aggr_cpu_id__core - Create an aggr_cpu_id with the cpu, core, die and socket
  * populated with the cpu, core, die and socket for cpu. The function signature
  * is compatible with aggr_cpu_id_get_t.
  */
-struct aggr_cpu_id aggr_cpu_id__cpu(int cpu, void *data);
+struct aggr_cpu_id aggr_cpu_id__cpu(struct perf_cpu cpu, void *data);
 /**
  * aggr_cpu_id__node - Create an aggr_cpu_id with the numa node populated for
  * cpu. The function signature is compatible with aggr_cpu_id_get_t.
  */
-struct aggr_cpu_id aggr_cpu_id__node(int cpu, void *data);
+struct aggr_cpu_id aggr_cpu_id__node(struct perf_cpu cpu, void *data);
 
 #endif /* __PERF_CPUMAP_H */
diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c
index 8affb37d90e7..84ca106a3246 100644
--- a/tools/perf/util/cputopo.c
+++ b/tools/perf/util/cputopo.c
@@ -187,7 +187,7 @@ struct cpu_topology *cpu_topology__new(void)
 	struct perf_cpu_map *map;
 	bool has_die = has_die_topology();
 
-	ncpus = cpu__max_present_cpu();
+	ncpus = cpu__max_present_cpu().cpu;
 
 	/* build online CPU map */
 	map = perf_cpu_map__new(NULL);
@@ -218,7 +218,7 @@ struct cpu_topology *cpu_topology__new(void)
 	tp->core_cpus_list = addr;
 
 	for (i = 0; i < nr; i++) {
-		if (!perf_cpu_map__has(map, i))
+		if (!perf_cpu_map__has(map, (struct perf_cpu){ .cpu = i }))
 			continue;
 
 		ret = build_cpu_topology(tp, i);
@@ -333,7 +333,7 @@ struct numa_topology *numa_topology__new(void)
 	tp->nr = nr;
 
 	for (i = 0; i < nr; i++) {
-		if (load_numa_node(&tp->nodes[i], node_map->map[i])) {
+		if (load_numa_node(&tp->nodes[i], node_map->map[i].cpu)) {
 			numa_topology__delete(tp);
 			tp = NULL;
 			break;
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index fd12c0dcaefb..579e44c59914 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -285,13 +285,13 @@ out_enomem:
 
 int perf_env__read_cpu_topology_map(struct perf_env *env)
 {
-	int cpu, nr_cpus;
+	int idx, nr_cpus;
 
 	if (env->cpu != NULL)
 		return 0;
 
 	if (env->nr_cpus_avail == 0)
-		env->nr_cpus_avail = cpu__max_present_cpu();
+		env->nr_cpus_avail = cpu__max_present_cpu().cpu;
 
 	nr_cpus = env->nr_cpus_avail;
 	if (nr_cpus == -1)
@@ -301,10 +301,12 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
 	if (env->cpu == NULL)
 		return -ENOMEM;
 
-	for (cpu = 0; cpu < nr_cpus; ++cpu) {
-		env->cpu[cpu].core_id	= cpu__get_core_id(cpu);
-		env->cpu[cpu].socket_id	= cpu__get_socket_id(cpu);
-		env->cpu[cpu].die_id	= cpu__get_die_id(cpu);
+	for (idx = 0; idx < nr_cpus; ++idx) {
+		struct perf_cpu cpu = { .cpu = idx };
+
+		env->cpu[idx].core_id	= cpu__get_core_id(cpu);
+		env->cpu[idx].socket_id	= cpu__get_socket_id(cpu);
+		env->cpu[idx].die_id	= cpu__get_die_id(cpu);
 	}
 
 	env->nr_cpus_avail = nr_cpus;
@@ -381,7 +383,7 @@ static int perf_env__read_arch(struct perf_env *env)
 static int perf_env__read_nr_cpus_avail(struct perf_env *env)
 {
 	if (env->nr_cpus_avail == 0)
-		env->nr_cpus_avail = cpu__max_present_cpu();
+		env->nr_cpus_avail = cpu__max_present_cpu().cpu;
 
 	return env->nr_cpus_avail ? 0 : -ENOENT;
 }
@@ -487,7 +489,7 @@ const char *perf_env__pmu_mappings(struct perf_env *env)
 	return env->pmu_mappings;
 }
 
-int perf_env__numa_node(struct perf_env *env, int cpu)
+int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu)
 {
 	if (!env->nr_numa_map) {
 		struct numa_node *nn;
@@ -495,7 +497,7 @@ int perf_env__numa_node(struct perf_env *env, int cpu)
 
 		for (i = 0; i < env->nr_numa_nodes; i++) {
 			nn = &env->numa_nodes[i];
-			nr = max(nr, perf_cpu_map__max(nn->map));
+			nr = max(nr, perf_cpu_map__max(nn->map).cpu);
 		}
 
 		nr++;
@@ -514,13 +516,14 @@ int perf_env__numa_node(struct perf_env *env, int cpu)
 		env->nr_numa_map = nr;
 
 		for (i = 0; i < env->nr_numa_nodes; i++) {
-			int tmp, j;
+			struct perf_cpu tmp;
+			int j;
 
 			nn = &env->numa_nodes[i];
-			perf_cpu_map__for_each_cpu(j, tmp, nn->map)
-				env->numa_map[j] = i;
+			perf_cpu_map__for_each_cpu(tmp, j, nn->map)
+				env->numa_map[tmp.cpu] = i;
 		}
 	}
 
-	return cpu >= 0 && cpu < env->nr_numa_map ? env->numa_map[cpu] : -1;
+	return cpu.cpu >= 0 && cpu.cpu < env->nr_numa_map ? env->numa_map[cpu.cpu] : -1;
 }
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 163e5ec503a2..a3541f98e1fc 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/rbtree.h>
+#include "cpumap.h"
 #include "rwsem.h"
 
 struct perf_cpu_map;
@@ -170,5 +171,5 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env,
 bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node);
 struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id);
 
-int perf_env__numa_node(struct perf_env *env, int cpu);
+int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu);
 #endif /* __PERF_ENV_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 39d294f6c321..11eb95b2106b 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -350,13 +350,13 @@ struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affin
 		.cpu_map_idx = 0,
 		.evlist_cpu_map_idx = 0,
 		.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
-		.cpu = -1,
+		.cpu = (struct perf_cpu){ .cpu = -1},
 		.affinity = affinity,
 	};
 
 	if (itr.affinity) {
 		itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
-		affinity__set(itr.affinity, itr.cpu);
+		affinity__set(itr.affinity, itr.cpu.cpu);
 		itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
 		/*
 		 * If this CPU isn't in the evsel's cpu map then advance through
@@ -385,7 +385,7 @@ void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
 			perf_cpu_map__cpu(evlist_cpu_itr->container->core.all_cpus,
 					  evlist_cpu_itr->evlist_cpu_map_idx);
 		if (evlist_cpu_itr->affinity)
-			affinity__set(evlist_cpu_itr->affinity, evlist_cpu_itr->cpu);
+			affinity__set(evlist_cpu_itr->affinity, evlist_cpu_itr->cpu.cpu);
 		evlist_cpu_itr->cpu_map_idx =
 			perf_cpu_map__idx(evlist_cpu_itr->evsel->core.cpus,
 					  evlist_cpu_itr->cpu);
@@ -819,7 +819,7 @@ perf_evlist__mmap_cb_get(struct perf_evlist *_evlist, bool overwrite, int idx)
 
 static int
 perf_evlist__mmap_cb_mmap(struct perf_mmap *_map, struct perf_mmap_param *_mp,
-			  int output, int cpu)
+			  int output, struct perf_cpu cpu)
 {
 	struct mmap *map = container_of(_map, struct mmap, core);
 	struct mmap_params *mp = container_of(_mp, struct mmap_params, core);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 57828ebfcb61..64cba56fbc74 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -344,7 +344,7 @@ struct evlist_cpu_iterator {
 	/** The number of CPU map entries in evlist->core.all_cpus. */
 	int evlist_cpu_map_nr;
 	/** The current CPU of the iterator. */
-	int cpu;
+	struct perf_cpu cpu;
 	/** If present, used to set the affinity when switching between CPUs. */
 	struct affinity *affinity;
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 796923c80ff6..7660e0bf3b50 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1594,7 +1594,7 @@ int __evsel__read_on_cpu(struct evsel *evsel, int cpu_map_idx, int thread, bool
 static int evsel__match_other_cpu(struct evsel *evsel, struct evsel *other,
 				  int cpu_map_idx)
 {
-	int cpu;
+	struct perf_cpu cpu;
 
 	cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
 	return perf_cpu_map__idx(other->core.cpus, cpu);
@@ -2020,9 +2020,9 @@ retry_open:
 			test_attr__ready();
 
 			pr_debug2_peo("sys_perf_event_open: pid %d  cpu %d  group_fd %d  flags %#lx",
-				pid, cpus->map[idx], group_fd, evsel->open_flags);
+				pid, cpus->map[idx].cpu, group_fd, evsel->open_flags);
 
-			fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx],
+			fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx].cpu,
 						group_fd, evsel->open_flags);
 
 			FD(evsel, idx, thread) = fd;
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 666b59baeb70..e808738493e2 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -410,7 +410,7 @@ double expr__get_literal(const char *literal)
 		return smt_on() > 0 ? 1.0 : 0.0;
 
 	if (!strcmp("#num_cpus", literal))
-		return cpu__max_present_cpu();
+		return cpu__max_present_cpu().cpu;
 
 	/*
 	 * Assume that topology strings are consistent, such as CPUs "0-1"
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index e3c1a532d059..6da12e522edc 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -472,7 +472,7 @@ static int write_nrcpus(struct feat_fd *ff,
 	u32 nrc, nra;
 	int ret;
 
-	nrc = cpu__max_present_cpu();
+	nrc = cpu__max_present_cpu().cpu;
 
 	nr = sysconf(_SC_NPROCESSORS_ONLN);
 	if (nr < 0)
@@ -1163,7 +1163,7 @@ static int build_caches(struct cpu_cache_level caches[], u32 *cntp)
 	u32 nr, cpu;
 	u16 level;
 
-	nr = cpu__max_cpu();
+	nr = cpu__max_cpu().cpu;
 
 	for (cpu = 0; cpu < nr; cpu++) {
 		for (level = 0; level < MAX_CACHE_LVL; level++) {
@@ -1195,7 +1195,7 @@ static int build_caches(struct cpu_cache_level caches[], u32 *cntp)
 static int write_cache(struct feat_fd *ff,
 		       struct evlist *evlist __maybe_unused)
 {
-	u32 max_caches = cpu__max_cpu() * MAX_CACHE_LVL;
+	u32 max_caches = cpu__max_cpu().cpu * MAX_CACHE_LVL;
 	struct cpu_cache_level caches[max_caches];
 	u32 cnt = 0, i, version = 1;
 	int ret;
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 23ecdba9e670..12261ed8c15b 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -94,7 +94,7 @@ static void perf_mmap__aio_free(struct mmap *map, int idx)
 	}
 }
 
-static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity)
+static int perf_mmap__aio_bind(struct mmap *map, int idx, struct perf_cpu cpu, int affinity)
 {
 	void *data;
 	size_t mmap_len;
@@ -138,7 +138,7 @@ static void perf_mmap__aio_free(struct mmap *map, int idx)
 }
 
 static int perf_mmap__aio_bind(struct mmap *map __maybe_unused, int idx __maybe_unused,
-		int cpu __maybe_unused, int affinity __maybe_unused)
+		struct perf_cpu cpu __maybe_unused, int affinity __maybe_unused)
 {
 	return 0;
 }
@@ -240,7 +240,8 @@ void mmap__munmap(struct mmap *map)
 
 static void build_node_mask(int node, struct mmap_cpu_mask *mask)
 {
-	int c, cpu, nr_cpus;
+	int idx, nr_cpus;
+	struct perf_cpu cpu;
 	const struct perf_cpu_map *cpu_map = NULL;
 
 	cpu_map = cpu_map__online();
@@ -248,16 +249,16 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask)
 		return;
 
 	nr_cpus = perf_cpu_map__nr(cpu_map);
-	for (c = 0; c < nr_cpus; c++) {
-		cpu = cpu_map->map[c]; /* map c index to online cpu index */
+	for (idx = 0; idx < nr_cpus; idx++) {
+		cpu = cpu_map->map[idx]; /* map c index to online cpu index */
 		if (cpu__get_node(cpu) == node)
-			set_bit(cpu, mask->bits);
+			set_bit(cpu.cpu, mask->bits);
 	}
 }
 
 static int perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *mp)
 {
-	map->affinity_mask.nbits = cpu__max_cpu();
+	map->affinity_mask.nbits = cpu__max_cpu().cpu;
 	map->affinity_mask.bits = bitmap_zalloc(map->affinity_mask.nbits);
 	if (!map->affinity_mask.bits)
 		return -1;
@@ -265,12 +266,12 @@ static int perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *
 	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
 		build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask);
 	else if (mp->affinity == PERF_AFFINITY_CPU)
-		set_bit(map->core.cpu, map->affinity_mask.bits);
+		set_bit(map->core.cpu.cpu, map->affinity_mask.bits);
 
 	return 0;
 }
 
-int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu)
+int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, struct perf_cpu cpu)
 {
 	if (perf_mmap__mmap(&map->core, &mp->core, fd, cpu)) {
 		pr_debug2("failed to mmap perf event ring buffer, error %d\n",
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 8e259b9610f8..83f6bd4d4082 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/ring_buffer.h>
 #include <linux/bitops.h>
+#include <perf/cpumap.h>
 #include <stdbool.h>
 #include <pthread.h> // for cpu_set_t
 #ifdef HAVE_AIO_SUPPORT
@@ -52,7 +53,7 @@ struct mmap_params {
 	struct auxtrace_mmap_params auxtrace_mp;
 };
 
-int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu);
+int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, struct perf_cpu cpu);
 void mmap__munmap(struct mmap *map);
 
 union perf_event *perf_mmap__read_forward(struct mmap *map);
diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c
index 020411682a3c..734d006d9a8c 100644
--- a/tools/perf/util/perf_api_probe.c
+++ b/tools/perf/util/perf_api_probe.c
@@ -11,7 +11,7 @@
 
 typedef void (*setup_probe_fn_t)(struct evsel *evsel);
 
-static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
+static int perf_do_probe_api(setup_probe_fn_t fn, struct perf_cpu cpu, const char *str)
 {
 	struct evlist *evlist;
 	struct evsel *evsel;
@@ -29,7 +29,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
 	evsel = evlist__first(evlist);
 
 	while (1) {
-		fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags);
+		fd = sys_perf_event_open(&evsel->core.attr, pid, cpu.cpu, -1, flags);
 		if (fd < 0) {
 			if (pid == -1 && errno == EACCES) {
 				pid = 0;
@@ -43,7 +43,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
 
 	fn(evsel);
 
-	fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags);
+	fd = sys_perf_event_open(&evsel->core.attr, pid, cpu.cpu, -1, flags);
 	if (fd < 0) {
 		if (errno == EINVAL)
 			err = -EINVAL;
@@ -61,7 +61,8 @@ static bool perf_probe_api(setup_probe_fn_t fn)
 {
 	const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL};
 	struct perf_cpu_map *cpus;
-	int cpu, ret, i = 0;
+	struct perf_cpu cpu;
+	int ret, i = 0;
 
 	cpus = perf_cpu_map__new(NULL);
 	if (!cpus)
@@ -136,15 +137,17 @@ bool perf_can_record_cpu_wide(void)
 		.exclude_kernel = 1,
 	};
 	struct perf_cpu_map *cpus;
-	int cpu, fd;
+	struct perf_cpu cpu;
+	int fd;
 
 	cpus = perf_cpu_map__new(NULL);
 	if (!cpus)
 		return false;
+
 	cpu = cpus->map[0];
 	perf_cpu_map__put(cpus);
 
-	fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+	fd = sys_perf_event_open(&attr, -1, cpu.cpu, -1, 0);
 	if (fd < 0)
 		return false;
 	close(fd);
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 7f782a31bda3..95fb53899bcd 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1057,7 +1057,7 @@ static struct mmap *get_md(struct evlist *evlist, int cpu)
 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		struct mmap *md = &evlist->mmap[i];
 
-		if (md->core.cpu == cpu)
+		if (md->core.cpu.cpu == cpu)
 			return md;
 	}
 
@@ -1443,7 +1443,7 @@ error:
  * Dummy, to avoid dragging all the test_attr infrastructure in the python
  * binding.
  */
-void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
+void test_attr__open(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cpu,
                      int fd, int group_fd, unsigned long flags)
 {
 }
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index bff669b615ee..20461f174991 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -106,7 +106,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call
 	if (opts->group)
 		evlist__set_leader(evlist);
 
-	if (evlist->core.cpus->map[0] < 0)
+	if (evlist->core.cpus->map[0].cpu < 0)
 		opts->no_inherit = true;
 
 	use_comm_exec = perf_can_comm_exec();
@@ -229,7 +229,8 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str)
 {
 	struct evlist *temp_evlist;
 	struct evsel *evsel;
-	int err, fd, cpu;
+	int err, fd;
+	struct perf_cpu cpu = { .cpu = 0 };
 	bool ret = false;
 	pid_t pid = -1;
 
@@ -246,14 +247,16 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str)
 	if (!evlist || perf_cpu_map__empty(evlist->core.cpus)) {
 		struct perf_cpu_map *cpus = perf_cpu_map__new(NULL);
 
-		cpu =  cpus ? cpus->map[0] : 0;
+		if (cpus)
+			cpu =  cpus->map[0];
+
 		perf_cpu_map__put(cpus);
 	} else {
 		cpu = evlist->core.cpus->map[0];
 	}
 
 	while (1) {
-		fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1,
+		fd = sys_perf_event_open(&evsel->core.attr, pid, cpu.cpu, -1,
 					 perf_event_open_cloexec_flag());
 		if (fd < 0) {
 			if (pid == -1 && errno == EACCES) {
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 0445bee9290f..bd95d60018a9 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1555,7 +1555,7 @@ static void get_handler_name(char *str, size_t size,
 }
 
 static void
-process_stat(struct evsel *counter, int cpu, int thread, u64 tstamp,
+process_stat(struct evsel *counter, struct perf_cpu cpu, int thread, u64 tstamp,
 	     struct perf_counts_values *count)
 {
 	PyObject *handler, *t;
@@ -1575,7 +1575,7 @@ process_stat(struct evsel *counter, int cpu, int thread, u64 tstamp,
 		return;
 	}
 
-	PyTuple_SetItem(t, n++, _PyLong_FromLong(cpu));
+	PyTuple_SetItem(t, n++, _PyLong_FromLong(cpu.cpu));
 	PyTuple_SetItem(t, n++, _PyLong_FromLong(thread));
 
 	tuple_set_u64(t, n++, tstamp);
@@ -1599,7 +1599,7 @@ static void python_process_stat(struct perf_stat_config *config,
 	int cpu, thread;
 
 	if (config->aggr_mode == AGGR_GLOBAL) {
-		process_stat(counter, -1, -1, tstamp,
+		process_stat(counter, (struct perf_cpu){ .cpu = -1 }, -1, tstamp,
 			     &counter->counts->aggr);
 		return;
 	}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index e1a273048681..f19348dddd55 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2538,15 +2538,15 @@ int perf_session__cpu_bitmap(struct perf_session *session,
 	}
 
 	for (i = 0; i < map->nr; i++) {
-		int cpu = map->map[i];
+		struct perf_cpu cpu = map->map[i];
 
-		if (cpu >= nr_cpus) {
+		if (cpu.cpu >= nr_cpus) {
 			pr_err("Requested CPU %d too large. "
-			       "Consider raising MAX_NR_CPUS\n", cpu);
+			       "Consider raising MAX_NR_CPUS\n", cpu.cpu);
 			goto out_delete_map;
 		}
 
-		set_bit(cpu, cpu_bitmap);
+		set_bit(cpu.cpu, cpu_bitmap);
 	}
 
 	err = 0;
@@ -2598,7 +2598,7 @@ int perf_event__process_id_index(struct perf_session *session,
 		if (!sid)
 			return -ENOENT;
 		sid->idx = e->idx;
-		sid->cpu = e->cpu;
+		sid->cpu.cpu = e->cpu;
 		sid->tid = e->tid;
 	}
 	return 0;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index ba95379efcfb..5db83e51ceef 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -121,10 +121,10 @@ static void aggr_printout(struct perf_stat_config *config,
 				id.die,
 				config->csv_output ? 0 : -3,
 				id.core, config->csv_sep);
-		} else if (id.cpu > -1) {
+		} else if (id.cpu.cpu > -1) {
 			fprintf(config->output, "CPU%*d%s",
 				config->csv_output ? 0 : -7,
-				id.cpu, config->csv_sep);
+				id.cpu.cpu, config->csv_sep);
 		}
 		break;
 	case AGGR_THREAD:
@@ -331,7 +331,8 @@ static int first_shadow_cpu_map_idx(struct perf_stat_config *config,
 				struct evsel *evsel, const struct aggr_cpu_id *id)
 {
 	struct perf_cpu_map *cpus = evsel__cpus(evsel);
-	int cpu, idx;
+	struct perf_cpu cpu;
+	int idx;
 
 	if (config->aggr_mode == AGGR_NONE)
 		return perf_cpu_map__idx(cpus, id->cpu);
@@ -513,7 +514,8 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 static void aggr_update_shadow(struct perf_stat_config *config,
 			       struct evlist *evlist)
 {
-	int cpu, idx, s;
+	int idx, s;
+	struct perf_cpu cpu;
 	struct aggr_cpu_id s2, id;
 	u64 val;
 	struct evsel *counter;
@@ -633,7 +635,8 @@ static void aggr_cb(struct perf_stat_config *config,
 		    struct evsel *counter, void *data, bool first)
 {
 	struct aggr_data *ad = data;
-	int idx, cpu;
+	int idx;
+	struct perf_cpu cpu;
 	struct perf_cpu_map *cpus;
 	struct aggr_cpu_id s2;
 
@@ -666,7 +669,7 @@ static void aggr_cb(struct perf_stat_config *config,
 static void print_counter_aggrdata(struct perf_stat_config *config,
 				   struct evsel *counter, int s,
 				   char *prefix, bool metric_only,
-				   bool *first, int cpu)
+				   bool *first, struct perf_cpu cpu)
 {
 	struct aggr_data ad;
 	FILE *output = config->output;
@@ -696,7 +699,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 		fprintf(output, "%s", prefix);
 
 	uval = val * counter->scale;
-	if (cpu != -1)
+	if (cpu.cpu != -1)
 		id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
 
 	printout(config, id, nr, counter, uval,
@@ -731,8 +734,8 @@ static void print_aggr(struct perf_stat_config *config,
 		first = true;
 		evlist__for_each_entry(evlist, counter) {
 			print_counter_aggrdata(config, counter, s,
-					       prefix, metric_only,
-					       &first, /*cpu=*/-1);
+					prefix, metric_only,
+					&first, (struct perf_cpu){ .cpu = -1 });
 		}
 		if (metric_only)
 			fputc('\n', output);
@@ -893,7 +896,8 @@ static void print_counter(struct perf_stat_config *config,
 	FILE *output = config->output;
 	u64 ena, run, val;
 	double uval;
-	int idx, cpu;
+	int idx;
+	struct perf_cpu cpu;
 	struct aggr_cpu_id id;
 
 	perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
@@ -921,7 +925,8 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 				 struct evlist *evlist,
 				 char *prefix)
 {
-	int all_idx, cpu;
+	int all_idx;
+	struct perf_cpu cpu;
 
 	perf_cpu_map__for_each_cpu(cpu, all_idx, evlist->core.cpus) {
 		struct evsel *counter;
@@ -1211,7 +1216,8 @@ static void print_percore_thread(struct perf_stat_config *config,
 	struct aggr_cpu_id s2, id;
 	struct perf_cpu_map *cpus;
 	bool first = true;
-	int idx, cpu;
+	int idx;
+	struct perf_cpu cpu;
 
 	cpus = evsel__cpus(counter);
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
@@ -1247,8 +1253,8 @@ static void print_percore(struct perf_stat_config *config,
 			fprintf(output, "%s", prefix);
 
 		print_counter_aggrdata(config, counter, s,
-				       prefix, metric_only,
-				       &first, /*cpu=*/-1);
+				prefix, metric_only,
+				&first, (struct perf_cpu){ .cpu = -1 });
 	}
 
 	if (metric_only)
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 7dbd7c4f3c33..ee6f03481215 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -297,7 +297,7 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals,
 {
 	struct hashmap *mask = counter->per_pkg_mask;
 	struct perf_cpu_map *cpus = evsel__cpus(counter);
-	int cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
+	struct perf_cpu cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
 	int s, d, ret = 0;
 	uint64_t *key;
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 691c12fd8976..335d19cc3063 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -108,7 +108,7 @@ struct runtime_stat {
 	struct rblist value_list;
 };
 
-typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config, int cpu);
+typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config, struct perf_cpu cpu);
 
 struct perf_stat_config {
 	enum aggr_mode		 aggr_mode;
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index 96f941e01681..4c9f211249db 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -728,7 +728,7 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus)
 	int i;
 	int ret = 0;
 	struct perf_cpu_map *m;
-	int c;
+	struct perf_cpu c;
 
 	m = perf_cpu_map__new(s);
 	if (!m)
@@ -736,12 +736,12 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus)
 
 	for (i = 0; i < m->nr; i++) {
 		c = m->map[i];
-		if (c >= nr_cpus) {
+		if (c.cpu >= nr_cpus) {
 			ret = -1;
 			break;
 		}
 
-		set_bit(c, cpumask_bits(b));
+		set_bit(c.cpu, cpumask_bits(b));
 	}
 
 	perf_cpu_map__put(m);
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 198982109f0f..c9ba8050cc2b 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1191,7 +1191,7 @@ static void synthesize_cpus(struct cpu_map_entries *cpus,
 	cpus->nr = map->nr;
 
 	for (i = 0; i < map->nr; i++)
-		cpus->cpu[i] = map->map[i];
+		cpus->cpu[i] = map->map[i].cpu;
 }
 
 static void synthesize_mask(struct perf_record_record_cpu_map *mask,
@@ -1203,7 +1203,7 @@ static void synthesize_mask(struct perf_record_record_cpu_map *mask,
 	mask->long_size = sizeof(long);
 
 	for (i = 0; i < map->nr; i++)
-		set_bit(map->map[i], mask->mask);
+		set_bit(map->map[i].cpu, mask->mask);
 }
 
 static size_t cpus_size(struct perf_cpu_map *map)
@@ -1219,7 +1219,7 @@ static size_t mask_size(struct perf_cpu_map *map, int *max)
 
 	for (i = 0; i < map->nr; i++) {
 		/* bit position of the cpu is + 1 */
-		int bit = map->map[i] + 1;
+		int bit = map->map[i].cpu + 1;
 
 		if (bit > *max)
 			*max = bit;
@@ -1354,7 +1354,7 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool,
 }
 
 int perf_event__synthesize_stat(struct perf_tool *tool,
-				u32 cpu, u32 thread, u64 id,
+				struct perf_cpu cpu, u32 thread, u64 id,
 				struct perf_counts_values *count,
 				perf_event__handler_t process,
 				struct machine *machine)
@@ -1366,7 +1366,7 @@ int perf_event__synthesize_stat(struct perf_tool *tool,
 	event.header.misc = 0;
 
 	event.id        = id;
-	event.cpu       = cpu;
+	event.cpu       = cpu.cpu;
 	event.thread    = thread;
 	event.val       = count->val;
 	event.ena       = count->ena;
@@ -1763,7 +1763,7 @@ int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_
 			}
 
 			e->idx = sid->idx;
-			e->cpu = sid->cpu;
+			e->cpu = sid->cpu.cpu;
 			e->tid = sid->tid;
 		}
 	}
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index c931433bacbf..78a0450db164 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h
@@ -6,6 +6,7 @@
 #include <sys/types.h> // pid_t
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <perf/cpumap.h>
 
 struct auxtrace_record;
 struct dso;
@@ -63,7 +64,7 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
 int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs);
 int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine);
-int perf_event__synthesize_stat(struct perf_tool *tool, u32 cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_stat(struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine);
 int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data);
 int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, unsigned int nr_threads_synthesize);
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 9f0d36ba77f2..9443c29afa52 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -11,6 +11,9 @@
 #include <stddef.h>
 #include <linux/compiler.h>
 #include <sys/types.h>
+#ifndef __cplusplus
+#include <internal/cpumap.h>
+#endif
 
 /* General helper functions */
 void usage(const char *err) __noreturn;
@@ -66,6 +69,6 @@ extern bool test_attr__enabled;
 void test_attr__ready(void);
 void test_attr__init(void);
 struct perf_event_attr;
-void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
+void test_attr__open(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cpu,
 		     int fd, int group_fd, unsigned long flags);
 #endif /* GIT_COMPAT_UTIL_H */
-- 
cgit v1.2.3


From 0ce05781f4905fcfbbb489519e36be71c7b0bbcc Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 23 Nov 2021 16:12:30 -0800
Subject: perf tools: Fix SMT fallback with large core counts

strtoull can only read a 64-bit bitmap. On an AMD EPYC core_cpus may look
like:

00000000,00000000,00000000,00000001,00000000,00000000,00000000,00000001

and so the sibling wasn't spotted. Fix by writing a simple hweight string
parser.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211124001231.3277836-3-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/smt.c | 68 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 34f1b1b1176c..2636be65305a 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -5,6 +5,56 @@
 #include "api/fs/fs.h"
 #include "smt.h"
 
+/**
+ * hweight_str - Returns the number of bits set in str. Stops at first non-hex
+ *	       or ',' character.
+ */
+static int hweight_str(char *str)
+{
+	int result = 0;
+
+	while (*str) {
+		switch (*str++) {
+		case '0':
+		case ',':
+			break;
+		case '1':
+		case '2':
+		case '4':
+		case '8':
+			result++;
+			break;
+		case '3':
+		case '5':
+		case '6':
+		case '9':
+		case 'a':
+		case 'A':
+		case 'c':
+		case 'C':
+			result += 2;
+			break;
+		case '7':
+		case 'b':
+		case 'B':
+		case 'd':
+		case 'D':
+		case 'e':
+		case 'E':
+			result += 3;
+			break;
+		case 'f':
+		case 'F':
+			result += 4;
+			break;
+		default:
+			goto done;
+		}
+	}
+done:
+	return result;
+}
+
 int smt_on(void)
 {
 	static bool cached;
@@ -15,9 +65,12 @@ int smt_on(void)
 	if (cached)
 		return cached_result;
 
-	if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) >= 0)
-		goto done;
+	if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) >= 0) {
+		cached = true;
+		return cached_result;
+	}
 
+	cached_result = 0;
 	ncpu = sysconf(_SC_NPROCESSORS_CONF);
 	for (cpu = 0; cpu < ncpu; cpu++) {
 		unsigned long long siblings;
@@ -35,18 +88,13 @@ int smt_on(void)
 				continue;
 		}
 		/* Entry is hex, but does not have 0x, so need custom parser */
-		siblings = strtoull(str, NULL, 16);
+		siblings = hweight_str(str);
 		free(str);
-		if (hweight64(siblings) > 1) {
+		if (siblings > 1) {
 			cached_result = 1;
-			cached = true;
 			break;
 		}
 	}
-	if (!cached) {
-		cached_result = 0;
-done:
-		cached = true;
-	}
+	cached = true;
 	return cached_result;
 }
-- 
cgit v1.2.3


From 6dd8646939a770e4ec0220c1c19d6af25c5877b7 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 23 Nov 2021 16:12:31 -0800
Subject: perf tools: Probe non-deprecated sysfs path 1st

Following Documentation/ABI/stable/sysfs-devices-system-cpu the
/sys/devices/system/cpu/cpuX/topology/core_cpus is deprecated in favor
of thread_siblings, so probe thread_siblings before falling back on
core_cpus.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: https://lore.kernel.org/r/20211124001231.3277836-4-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/smt.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 2636be65305a..2b0a36ebf27a 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -79,11 +79,10 @@ int smt_on(void)
 		char fn[256];
 
 		snprintf(fn, sizeof fn,
-			"devices/system/cpu/cpu%d/topology/core_cpus", cpu);
+			"devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
 		if (sysfs__read_str(fn, &str, &strlen) < 0) {
 			snprintf(fn, sizeof fn,
-				"devices/system/cpu/cpu%d/topology/thread_siblings",
-				cpu);
+				"devices/system/cpu/cpu%d/topology/core_cpus", cpu);
 			if (sysfs__read_str(fn, &str, &strlen) < 0)
 				continue;
 		}
-- 
cgit v1.2.3


From f56ef30a31d388663e78b9be687d67748c9b7297 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 23 Nov 2021 16:12:28 -0800
Subject: perf expr: Add debug logging for literals

Useful for diagnosing problems with metrics.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211124001231.3277836-1-irogers@google.com
[ Fixed up perf_cpu conflict, i.e. we need to append ".cpu" to cpu__max_present_cpu() result ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/expr.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index e808738493e2..c94fb9bef919 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -405,12 +405,17 @@ double expr_id_data__source_count(const struct expr_id_data *data)
 double expr__get_literal(const char *literal)
 {
 	static struct cpu_topology *topology;
+	double result = NAN;
 
-	if (!strcmp("#smt_on", literal))
-		return smt_on() > 0 ? 1.0 : 0.0;
+	if (!strcmp("#smt_on", literal)) {
+		result = smt_on() > 0 ? 1.0 : 0.0;
+		goto out;
+	}
 
-	if (!strcmp("#num_cpus", literal))
-		return cpu__max_present_cpu().cpu;
+	if (!strcmp("#num_cpus", literal)) {
+		result = cpu__max_present_cpu().cpu;
+		goto out;
+	}
 
 	/*
 	 * Assume that topology strings are consistent, such as CPUs "0-1"
@@ -422,16 +427,24 @@ double expr__get_literal(const char *literal)
 		topology = cpu_topology__new();
 		if (!topology) {
 			pr_err("Error creating CPU topology");
-			return NAN;
+			goto out;
 		}
 	}
-	if (!strcmp("#num_packages", literal))
-		return topology->package_cpus_lists;
-	if (!strcmp("#num_dies", literal))
-		return topology->die_cpus_lists;
-	if (!strcmp("#num_cores", literal))
-		return topology->core_cpus_lists;
+	if (!strcmp("#num_packages", literal)) {
+		result = topology->package_cpus_lists;
+		goto out;
+	}
+	if (!strcmp("#num_dies", literal)) {
+		result = topology->die_cpus_lists;
+		goto out;
+	}
+	if (!strcmp("#num_cores", literal)) {
+		result = topology->core_cpus_lists;
+		goto out;
+	}
 
 	pr_err("Unrecognized literal '%s'", literal);
-	return NAN;
+out:
+	pr_debug2("literal: %s = %f\n", literal, result);
+	return result;
 }
-- 
cgit v1.2.3


From c0dd94558d0e473aa92254e1c48a47900c911e69 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 25 Nov 2021 23:13:05 -0800
Subject: perf pmu-events: Don't lower case MetricExpr

This patch changes MetricExpr to be written out in the same case. This
enables events in metrics to use modifiers like 'G' which currently
yield parse errors when made lower case. To keep tests passing the
literal #smt_on is compared in a non-case sensitive way - #SMT_on is
present in at least SkylakeX metrics.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paul Clarke <pc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20211126071305.3733878-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/jevents.c | 2 --
 tools/perf/util/expr.c          | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index 2e7c4153875b..1a57c3f81dd4 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -672,8 +672,6 @@ static int json_events(const char *fn,
 				addfield(map, &je.metric_constraint, "", "", val);
 			} else if (json_streq(map, field, "MetricExpr")) {
 				addfield(map, &je.metric_expr, "", "", val);
-				for (s = je.metric_expr; *s; s++)
-					*s = tolower(*s);
 			} else if (json_streq(map, field, "ArchStdEvent")) {
 				addfield(map, &arch_std, "", "", val);
 				for (s = arch_std; *s; s++)
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index c94fb9bef919..675f318ce7c1 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -407,7 +407,7 @@ double expr__get_literal(const char *literal)
 	static struct cpu_topology *topology;
 	double result = NAN;
 
-	if (!strcmp("#smt_on", literal)) {
+	if (!strcasecmp("#smt_on", literal)) {
 		result = smt_on() > 0 ? 1.0 : 0.0;
 		goto out;
 	}
-- 
cgit v1.2.3


From de0e444706ed98f2a13275a988d8595184be6d5c Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Thu, 13 Jan 2022 15:28:59 +0800
Subject: kselftests/net: adapt the timeout to the largest runtime

timeout in settings is used by each case under the same directory, so
it should adapt to the maximum runtime.

A normally running net/fib_nexthops.sh may be killed by this unsuitable
timeout. Furthermore, since the defect[1] of kselftests framework,
net/fib_nexthops.sh which might take at least (300 * 4) seconds would
block the whole kselftests framework previously.
$ git grep -w 'sleep 300' tools/testing/selftests/net
tools/testing/selftests/net/fib_nexthops.sh:    sleep 300
tools/testing/selftests/net/fib_nexthops.sh:    sleep 300
tools/testing/selftests/net/fib_nexthops.sh:    sleep 300
tools/testing/selftests/net/fib_nexthops.sh:    sleep 300

Enlarge the timeout by plus 300 based on the obvious largest runtime
to avoid the blocking.

[1]: https://www.spinics.net/lists/kernel/msg4185370.html

Signed-off-by: Zhou Jie <zhoujie2011@fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/settings | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings
index 694d70710ff0..dfc27cdc6c05 100644
--- a/tools/testing/selftests/net/settings
+++ b/tools/testing/selftests/net/settings
@@ -1 +1 @@
-timeout=300
+timeout=1500
-- 
cgit v1.2.3


From 35cb8c713a496e8c114eed5e2a5a30b359876df2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2021 10:19:37 -0300
Subject: tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in
 'perf bench mem memcpy'

To bring in the change made in this cset:

  f94909ceb1ed4bfd ("x86: Prepare asm files for straight-line-speculation")

It silences these perf tools build warnings, no change in the tools:

  Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S'
  diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S
  Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S'
  diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S

The code generated was checked before and after using 'objdump -d /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o',
no changes.

Cc: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/lib/memcpy_64.S | 12 ++++++------
 tools/arch/x86/lib/memset_64.S |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
index 1cc9da6e29c7..59cf2343f3d9 100644
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -39,7 +39,7 @@ SYM_FUNC_START_WEAK(memcpy)
 	rep movsq
 	movl %edx, %ecx
 	rep movsb
-	ret
+	RET
 SYM_FUNC_END(memcpy)
 SYM_FUNC_END_ALIAS(__memcpy)
 EXPORT_SYMBOL(memcpy)
@@ -53,7 +53,7 @@ SYM_FUNC_START_LOCAL(memcpy_erms)
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	rep movsb
-	ret
+	RET
 SYM_FUNC_END(memcpy_erms)
 
 SYM_FUNC_START_LOCAL(memcpy_orig)
@@ -137,7 +137,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	movq %r9,	1*8(%rdi)
 	movq %r10,	-2*8(%rdi, %rdx)
 	movq %r11,	-1*8(%rdi, %rdx)
-	retq
+	RET
 	.p2align 4
 .Lless_16bytes:
 	cmpl $8,	%edx
@@ -149,7 +149,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	movq -1*8(%rsi, %rdx),	%r9
 	movq %r8,	0*8(%rdi)
 	movq %r9,	-1*8(%rdi, %rdx)
-	retq
+	RET
 	.p2align 4
 .Lless_8bytes:
 	cmpl $4,	%edx
@@ -162,7 +162,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	movl -4(%rsi, %rdx), %r8d
 	movl %ecx, (%rdi)
 	movl %r8d, -4(%rdi, %rdx)
-	retq
+	RET
 	.p2align 4
 .Lless_3bytes:
 	subl $1, %edx
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	movb %cl, (%rdi)
 
 .Lend:
-	retq
+	RET
 SYM_FUNC_END(memcpy_orig)
 
 .popsection
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index 9827ae267f96..d624f2bc42f1 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memset)
 	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
-	ret
+	RET
 SYM_FUNC_END(__memset)
 SYM_FUNC_END_ALIAS(memset)
 EXPORT_SYMBOL(memset)
@@ -63,7 +63,7 @@ SYM_FUNC_START_LOCAL(memset_erms)
 	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
-	ret
+	RET
 SYM_FUNC_END(memset_erms)
 
 SYM_FUNC_START_LOCAL(memset_orig)
@@ -125,7 +125,7 @@ SYM_FUNC_START_LOCAL(memset_orig)
 
 .Lende:
 	movq	%r10,%rax
-	ret
+	RET
 
 .Lbad_alignment:
 	cmpq $7,%rdx
-- 
cgit v1.2.3


From f1dcda0f79548c04f585108e2e165cb4fec951e8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 3 May 2021 11:48:26 -0300
Subject: tools headers UAPI: Update tools's copy of drm.h header

Picking the changes from:

  43d5ac7d07023cd1 ("drm: document DRM_IOCTL_MODE_GETFB2")

It is just a comment, so no changes and silences these perf build warnings:

  Warning: Kernel ABI header at 'tools/include/uapi/drm/drm.h' differs from latest version at 'include/uapi/drm/drm.h'
  diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h

Cc: Simon Ser <contact@emersion.fr>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/drm/drm.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h
index 3b810b53ba8b..642808520d92 100644
--- a/tools/include/uapi/drm/drm.h
+++ b/tools/include/uapi/drm/drm.h
@@ -1096,6 +1096,24 @@ extern "C" {
 #define DRM_IOCTL_SYNCOBJ_TRANSFER	DRM_IOWR(0xCC, struct drm_syncobj_transfer)
 #define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL	DRM_IOWR(0xCD, struct drm_syncobj_timeline_array)
 
+/**
+ * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata.
+ *
+ * This queries metadata about a framebuffer. User-space fills
+ * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the
+ * struct as the output.
+ *
+ * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
+ * will be filled with GEM buffer handles. Planes are valid until one has a
+ * zero handle -- this can be used to compute the number of planes.
+ *
+ * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
+ * until one has a zero &drm_mode_fb_cmd2.pitches.
+ *
+ * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
+ * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
+ * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
+ */
 #define DRM_IOCTL_MODE_GETFB2		DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
 
 /*
-- 
cgit v1.2.3


From 486e5ed88827dabd295cd55f368d513ee8c30eb1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 1 Jul 2021 13:39:15 -0300
Subject: tools headers cpufeatures: Sync with the kernel sources

To pick the changes from:

  d341db8f48ea4331 ("x86/cpufeatures: Add AMD Collaborative Processor Performance Control feature flag")

This only causes these perf files to be rebuilt:

  CC       /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o
  CC       /tmp/build/perf/bench/mem-memset-x86-64-asm.o

And addresses this perf build warning:

  Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h'
  diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h

Cc: Huang Rui <ray.huang@amd.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index d5b5f2ab87a0..18de5f76f198 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -315,6 +315,7 @@
 #define X86_FEATURE_AMD_SSBD		(13*32+24) /* "" Speculative Store Bypass Disable */
 #define X86_FEATURE_VIRT_SSBD		(13*32+25) /* Virtualized Speculative Store Bypass Disable */
 #define X86_FEATURE_AMD_SSB_NO		(13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC		(13*32+27) /* Collaborative Processor Performance Control */
 
 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
 #define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */
-- 
cgit v1.2.3


From 79ce8f43ac5ac16d6d63729e858da42b0460d87b Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:20 +0100
Subject: rtla: Real-Time Linux Analysis tool

The rtla is a meta-tool that includes a set of commands that aims
to analyze the real-time properties of Linux. But instead of testing
Linux as a black box, rtla leverages kernel tracing capabilities to
provide precise information about the properties and root causes of
unexpected results.

rtla --help works and provide information about the available options.

This is just the "main" and the Makefile, no function yet.

Link: https://lkml.kernel.org/r/bf9118ed43a09e6c054c9a491cbe7411ad1acd89.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/Makefile   | 76 +++++++++++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/README.txt | 36 ++++++++++++++++++++
 tools/tracing/rtla/src/rtla.c | 72 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 tools/tracing/rtla/Makefile
 create mode 100644 tools/tracing/rtla/README.txt
 create mode 100644 tools/tracing/rtla/src/rtla.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
new file mode 100644
index 000000000000..56b1bdd60c96
--- /dev/null
+++ b/tools/tracing/rtla/Makefile
@@ -0,0 +1,76 @@
+NAME	:=	rtla
+VERSION	:=	0.5
+
+# From libtracefs:
+# Makefiles suck: This macro sets a default value of $(2) for the
+# variable named by $(1), unless the variable has been set by
+# environment or command line. This is necessary for CC and AR
+# because make sets default values, so the simpler ?= approach
+# won't work as expected.
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+$(call allow-override,STRIP,$(CROSS_COMPILE)strip)
+$(call allow-override,PKG_CONFIG,pkg-config)
+$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/)
+$(call allow-override,LDCONFIG,ldconfig)
+
+INSTALL	=	install
+FOPTS	:=	-flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong \
+		-fasynchronous-unwind-tables -fstack-clash-protection
+WOPTS	:= 	-Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized
+
+TRACEFS_HEADERS	:= $$($(PKG_CONFIG) --cflags libtracefs)
+
+CFLAGS	:=	-O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS)
+LDFLAGS	:=	-ggdb
+LIBS	:=	$$($(PKG_CONFIG) --libs libtracefs) -lprocps
+
+SRC	:=	$(wildcard src/*.c)
+HDR	:=	$(wildcard src/*.h)
+OBJ	:=	$(SRC:.c=.o)
+DIRS	:=	src
+FILES	:=	Makefile README.txt
+CEXT	:=	bz2
+TARBALL	:=	$(NAME)-$(VERSION).tar.$(CEXT)
+TAROPTS	:=	-cvjf $(TARBALL)
+BINDIR	:=	/usr/bin
+DATADIR	:=	/usr/share
+DOCDIR	:=	$(DATADIR)/doc
+MANDIR	:=	$(DATADIR)/man
+LICDIR	:=	$(DATADIR)/licenses
+
+.PHONY:	all
+all:	rtla
+
+rtla: $(OBJ)
+	$(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS)
+
+static: $(OBJ)
+	$(CC) -o rtla-static $(LDFLAGS) --static $(OBJ) $(LIBS) -lpthread -ldl
+
+.PHONY: install
+install:
+	$(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR)
+	$(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR)
+	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
+
+.PHONY: clean tarball
+clean:
+	@test ! -f rtla || rm rtla
+	@test ! -f rtla-static || rm rtla-static
+	@test ! -f src/rtla.o || rm src/rtla.o
+	@test ! -f $(TARBALL) || rm -f $(TARBALL)
+	@rm -rf *~ $(OBJ) *.tar.$(CEXT)
+
+tarball:  clean
+	rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION)
+	cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION)
+	tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION)
+	rm -rf $(NAME)-$(VERSION)
diff --git a/tools/tracing/rtla/README.txt b/tools/tracing/rtla/README.txt
new file mode 100644
index 000000000000..6c88446f7e74
--- /dev/null
+++ b/tools/tracing/rtla/README.txt
@@ -0,0 +1,36 @@
+RTLA: Real-Time Linux Analysis tools
+
+The rtla is a meta-tool that includes a set of commands that
+aims to analyze the real-time properties of Linux. But, instead of
+testing Linux as a black box, rtla leverages kernel tracing
+capabilities to provide precise information about the properties
+and root causes of unexpected results.
+
+Installing RTLA
+
+RTLA depends on some libraries and tools. More precisely, it depends on the
+following libraries:
+
+ - libtracefs
+ - libtraceevent
+ - procps
+
+It also depends on python3-docutils to compile man pages.
+
+For development, we suggest the following steps for compiling rtla:
+
+  $ git clone git://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git
+  $ cd libtraceevent/
+  $ make
+  $ sudo make install
+  $ cd ..
+  $ git clone git://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git
+  $ cd libtracefs/
+  $ make
+  $ sudo make install
+  $ cd ..
+  $ cd $rtla_src
+  $ make
+  $ sudo make install
+
+For further information, please refer to the rtla man page.
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
new file mode 100644
index 000000000000..5ae2664ed47d
--- /dev/null
+++ b/tools/tracing/rtla/src/rtla.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * rtla_usage - print rtla usage
+ */
+static void rtla_usage(void)
+{
+	int i;
+
+	static const char *msg[] = {
+		"",
+		"rtla version " VERSION,
+		"",
+		"  usage: rtla COMMAND ...",
+		"",
+		"  commands:",
+		"",
+		NULL,
+	};
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+/*
+ * run_command - try to run a rtla tool command
+ *
+ * It returns 0 if it fails. The tool's main will generally not
+ * return as they should call exit().
+ */
+int run_command(int argc, char **argv, int start_position)
+{
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	int retval;
+
+	/* is it an alias? */
+	retval = run_command(argc, argv, 0);
+	if (retval)
+		exit(0);
+
+	if (argc < 2)
+		goto usage;
+
+	if (strcmp(argv[1], "-h") == 0) {
+		rtla_usage();
+		exit(0);
+	} else if (strcmp(argv[1], "--help") == 0) {
+		rtla_usage();
+		exit(0);
+	}
+
+	retval = run_command(argc, argv, 1);
+	if (retval)
+		exit(0);
+
+usage:
+	rtla_usage();
+	exit(1);
+}
-- 
cgit v1.2.3


From b1696371d865fa28bd20adb13d1820ef031f2ee2 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:21 +0100
Subject: rtla: Helper functions for rtla

This is a set of utils and tracer helper functions. They are used by
rtla mostly to parse config, display data and some trace operations that
are not part of libtracefs (because they are only useful it for this
case).

Link: https://lkml.kernel.org/r/a94c128aba9e6e66d502b7094f2e8c7ac95b12e5.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/trace.c | 192 ++++++++++++++++++
 tools/tracing/rtla/src/trace.h |  27 +++
 tools/tracing/rtla/src/utils.c | 433 +++++++++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/utils.h |  56 ++++++
 4 files changed, 708 insertions(+)
 create mode 100644 tools/tracing/rtla/src/trace.c
 create mode 100644 tools/tracing/rtla/src/trace.h
 create mode 100644 tools/tracing/rtla/src/utils.c
 create mode 100644 tools/tracing/rtla/src/utils.h

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c
new file mode 100644
index 000000000000..107a0c6387f7
--- /dev/null
+++ b/tools/tracing/rtla/src/trace.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/sendfile.h>
+#include <tracefs.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "trace.h"
+#include "utils.h"
+
+/*
+ * enable_tracer_by_name - enable a tracer on the given instance
+ */
+int enable_tracer_by_name(struct tracefs_instance *inst, const char *tracer_name)
+{
+	enum tracefs_tracers tracer;
+	int retval;
+
+	tracer = TRACEFS_TRACER_CUSTOM;
+
+	debug_msg("enabling %s tracer\n", tracer_name);
+
+	retval = tracefs_tracer_set(inst, tracer, tracer_name);
+	if (retval < 0) {
+		if (errno == ENODEV)
+			err_msg("tracer %s not found!\n", tracer_name);
+
+		err_msg("failed to enable the tracer %s\n", tracer_name);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * disable_tracer - set nop tracer to the insta
+ */
+void disable_tracer(struct tracefs_instance *inst)
+{
+	enum tracefs_tracers t = TRACEFS_TRACER_NOP;
+	int retval;
+
+	retval = tracefs_tracer_set(inst, t);
+	if (retval < 0)
+		err_msg("oops, error disabling tracer\n");
+}
+
+/*
+ * create_instance - create a trace instance with *instance_name
+ */
+struct tracefs_instance *create_instance(char *instance_name)
+{
+	return tracefs_instance_create(instance_name);
+}
+
+/*
+ * destroy_instance - remove a trace instance and free the data
+ */
+void destroy_instance(struct tracefs_instance *inst)
+{
+	tracefs_instance_destroy(inst);
+	tracefs_instance_free(inst);
+}
+
+/*
+ * save_trace_to_file - save the trace output of the instance to the file
+ */
+int save_trace_to_file(struct tracefs_instance *inst, const char *filename)
+{
+	const char *file = "trace";
+	mode_t mode = 0644;
+	char buffer[4096];
+	int out_fd, in_fd;
+	int retval = -1;
+
+	in_fd = tracefs_instance_file_open(inst, file, O_RDONLY);
+	if (in_fd < 0) {
+		err_msg("Failed to open trace file\n");
+		return -1;
+	}
+
+	out_fd = creat(filename, mode);
+	if (out_fd < 0) {
+		err_msg("Failed to create output file %s\n", filename);
+		goto out_close_in;
+	}
+
+	do {
+		retval = read(in_fd, buffer, sizeof(buffer));
+		if (retval <= 0)
+			goto out_close;
+
+		retval = write(out_fd, buffer, retval);
+		if (retval < 0)
+			goto out_close;
+	} while (retval > 0);
+
+	retval = 0;
+out_close:
+	close(out_fd);
+out_close_in:
+	close(in_fd);
+	return retval;
+}
+
+/*
+ * collect_registered_events - call the existing callback function for the event
+ *
+ * If an event has a registered callback function, call it.
+ * Otherwise, ignore the event.
+ */
+int
+collect_registered_events(struct tep_event *event, struct tep_record *record,
+			  int cpu, void *context)
+{
+	struct trace_instance *trace = context;
+	struct trace_seq *s = trace->seq;
+
+	if (!event->handler)
+		return 0;
+
+	event->handler(s, record, event, context);
+
+	return 0;
+}
+
+/*
+ * trace_instance_destroy - destroy and free a rtla trace instance
+ */
+void trace_instance_destroy(struct trace_instance *trace)
+{
+	if (trace->inst) {
+		disable_tracer(trace->inst);
+		destroy_instance(trace->inst);
+	}
+
+	if (trace->seq)
+		free(trace->seq);
+
+	if (trace->tep)
+		tep_free(trace->tep);
+}
+
+/*
+ * trace_instance_init - create an rtla trace instance
+ *
+ * It is more than the tracefs instance, as it contains other
+ * things required for the tracing, such as the local events and
+ * a seq file.
+ *
+ * Note that the trace instance is returned disabled. This allows
+ * the tool to apply some other configs, like setting priority
+ * to the kernel threads, before starting generating trace entries.
+ */
+int trace_instance_init(struct trace_instance *trace, char *tool_name)
+{
+	trace->seq = calloc(1, sizeof(*trace->seq));
+	if (!trace->seq)
+		goto out_err;
+
+	trace_seq_init(trace->seq);
+
+	trace->inst = create_instance(tool_name);
+	if (!trace->inst)
+		goto out_err;
+
+	trace->tep = tracefs_local_events(NULL);
+	if (!trace->tep)
+		goto out_err;
+
+	/*
+	 * Let the main enable the record after setting some other
+	 * things such as the priority of the tracer's threads.
+	 */
+	tracefs_trace_off(trace->inst);
+
+	return 0;
+
+out_err:
+	trace_instance_destroy(trace);
+	return 1;
+}
+
+/*
+ * trace_instance_start - start tracing a given rtla instance
+ */
+int trace_instance_start(struct trace_instance *trace)
+{
+	return tracefs_trace_on(trace->inst);
+}
diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h
new file mode 100644
index 000000000000..0ea1df0ad9a7
--- /dev/null
+++ b/tools/tracing/rtla/src/trace.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <tracefs.h>
+#include <stddef.h>
+
+struct trace_instance {
+	struct tracefs_instance		*inst;
+	struct tep_handle		*tep;
+	struct trace_seq		*seq;
+};
+
+int trace_instance_init(struct trace_instance *trace, char *tool_name);
+int trace_instance_start(struct trace_instance *trace);
+void trace_instance_destroy(struct trace_instance *trace);
+
+struct trace_seq *get_trace_seq(void);
+int enable_tracer_by_name(struct tracefs_instance *inst, const char *tracer_name);
+void disable_tracer(struct tracefs_instance *inst);
+
+int enable_osnoise(struct trace_instance *trace);
+int enable_timerlat(struct trace_instance *trace);
+
+struct tracefs_instance *create_instance(char *instance_name);
+void destroy_instance(struct tracefs_instance *inst);
+
+int save_trace_to_file(struct tracefs_instance *inst, const char *filename);
+int collect_registered_events(struct tep_event *tep, struct tep_record *record,
+			      int cpu, void *context);
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
new file mode 100644
index 000000000000..1c9f0eea6166
--- /dev/null
+++ b/tools/tracing/rtla/src/utils.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <proc/readproc.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+
+#include "utils.h"
+
+#define MAX_MSG_LENGTH	1024
+int config_debug;
+
+/*
+ * err_msg - print an error message to the stderr
+ */
+void err_msg(const char *fmt, ...)
+{
+	char message[MAX_MSG_LENGTH];
+	va_list ap;
+
+	va_start(ap, fmt);
+	vsnprintf(message, sizeof(message), fmt, ap);
+	va_end(ap);
+
+	fprintf(stderr, "%s", message);
+}
+
+/*
+ * debug_msg - print a debug message to stderr if debug is set
+ */
+void debug_msg(const char *fmt, ...)
+{
+	char message[MAX_MSG_LENGTH];
+	va_list ap;
+
+	if (!config_debug)
+		return;
+
+	va_start(ap, fmt);
+	vsnprintf(message, sizeof(message), fmt, ap);
+	va_end(ap);
+
+	fprintf(stderr, "%s", message);
+}
+
+/*
+ * get_llong_from_str - get a long long int from a string
+ */
+long long get_llong_from_str(char *start)
+{
+	long long value;
+	char *end;
+
+	errno = 0;
+	value = strtoll(start, &end, 10);
+	if (errno || start == end)
+		return -1;
+
+	return value;
+}
+
+/*
+ * get_duration - fill output with a human readable duration since start_time
+ */
+void get_duration(time_t start_time, char *output, int output_size)
+{
+	time_t now = time(NULL);
+	struct tm *tm_info;
+	time_t duration;
+
+	duration = difftime(now, start_time);
+	tm_info = localtime(&duration);
+
+	snprintf(output, output_size, "%3d %02d:%02d:%02d",
+			tm_info->tm_yday,
+			tm_info->tm_hour - 1,
+			tm_info->tm_min,
+			tm_info->tm_sec);
+}
+
+/*
+ * parse_cpu_list - parse a cpu_list filling a char vector with cpus set
+ *
+ * Receives a cpu list, like 1-3,5 (cpus 1, 2, 3, 5), and then set the char
+ * in the monitored_cpus.
+ *
+ * XXX: convert to a bitmask.
+ */
+int parse_cpu_list(char *cpu_list, char **monitored_cpus)
+{
+	char *mon_cpus;
+	const char *p;
+	int end_cpu;
+	int nr_cpus;
+	int cpu;
+	int i;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	mon_cpus = malloc(nr_cpus * sizeof(char));
+	memset(mon_cpus, 0, (nr_cpus * sizeof(char)));
+
+	for (p = cpu_list; *p; ) {
+		cpu = atoi(p);
+		if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus)
+			goto err;
+
+		while (isdigit(*p))
+			p++;
+		if (*p == '-') {
+			p++;
+			end_cpu = atoi(p);
+			if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus)
+				goto err;
+			while (isdigit(*p))
+				p++;
+		} else
+			end_cpu = cpu;
+
+		if (cpu == end_cpu) {
+			debug_msg("cpu_list: adding cpu %d\n", cpu);
+			mon_cpus[cpu] = 1;
+		} else {
+			for (i = cpu; i <= end_cpu; i++) {
+				debug_msg("cpu_list: adding cpu %d\n", i);
+				mon_cpus[i] = 1;
+			}
+		}
+
+		if (*p == ',')
+			p++;
+	}
+
+	*monitored_cpus = mon_cpus;
+
+	return 0;
+
+err:
+	debug_msg("Error parsing the cpu list %s", cpu_list);
+	return 1;
+}
+
+/*
+ * parse_duration - parse duration with s/m/h/d suffix converting it to seconds
+ */
+long parse_seconds_duration(char *val)
+{
+	char *end;
+	long t;
+
+	t = strtol(val, &end, 10);
+
+	if (end) {
+		switch (*end) {
+		case 's':
+		case 'S':
+			break;
+		case 'm':
+		case 'M':
+			t *= 60;
+			break;
+		case 'h':
+		case 'H':
+			t *= 60 * 60;
+			break;
+
+		case 'd':
+		case 'D':
+			t *= 24 * 60 * 60;
+			break;
+		}
+	}
+
+	return t;
+}
+
+/*
+ * parse_ns_duration - parse duration with ns/us/ms/s converting it to nanoseconds
+ */
+long parse_ns_duration(char *val)
+{
+	char *end;
+	long t;
+
+	t = strtol(val, &end, 10);
+
+	if (end) {
+		if (!strncmp(end, "ns", 2)) {
+			return t;
+		} else if (!strncmp(end, "us", 2)) {
+			t *= 1000;
+			return t;
+		} else if (!strncmp(end, "ms", 2)) {
+			t *= 1000 * 1000;
+			return t;
+		} else if (!strncmp(end, "s", 1)) {
+			t *= 1000 * 1000 * 1000;
+			return t;
+		}
+		return -1;
+	}
+
+	return t;
+}
+
+/*
+ * This is a set of helper functions to use SCHED_DEADLINE.
+ */
+#ifdef __x86_64__
+# define __NR_sched_setattr	314
+# define __NR_sched_getattr	315
+#elif __i386__
+# define __NR_sched_setattr	351
+# define __NR_sched_getattr	352
+#elif __arm__
+# define __NR_sched_setattr	380
+# define __NR_sched_getattr	381
+#elif __aarch64__
+# define __NR_sched_setattr	274
+# define __NR_sched_getattr	275
+#elif __powerpc__
+# define __NR_sched_setattr	355
+# define __NR_sched_getattr	356
+#elif __s390x__
+# define __NR_sched_setattr	345
+# define __NR_sched_getattr	346
+#endif
+
+#define SCHED_DEADLINE		6
+
+static inline int sched_setattr(pid_t pid, const struct sched_attr *attr,
+				unsigned int flags) {
+	return syscall(__NR_sched_setattr, pid, attr, flags);
+}
+
+static inline int sched_getattr(pid_t pid, struct sched_attr *attr,
+				unsigned int size, unsigned int flags)
+{
+	return syscall(__NR_sched_getattr, pid, attr, size, flags);
+}
+
+int __set_sched_attr(int pid, struct sched_attr *attr)
+{
+	int flags = 0;
+	int retval;
+
+	retval = sched_setattr(pid, attr, flags);
+	if (retval < 0) {
+		err_msg("boost_with_deadline failed to boost pid %d: %s\n",
+			pid, strerror(errno));
+		return 1;
+	}
+
+	return 0;
+}
+/*
+ * set_comm_sched_attr - set sched params to threads starting with char *comm
+ *
+ * This function uses procps to list the currently running threads and then
+ * set the sched_attr *attr to the threads that start with char *comm. It is
+ * mainly used to set the priority to the kernel threads created by the
+ * tracers.
+ */
+int set_comm_sched_attr(const char *comm, struct sched_attr *attr)
+{
+	int flags = PROC_FILLCOM | PROC_FILLSTAT;
+	PROCTAB *ptp;
+	proc_t task;
+	int retval;
+
+	ptp = openproc(flags);
+	if (!ptp) {
+		err_msg("error openproc()\n");
+		return -ENOENT;
+	}
+
+	memset(&task, 0, sizeof(task));
+
+	while (readproc(ptp, &task)) {
+		retval = strncmp(comm, task.cmd, strlen(comm));
+		if (retval)
+			continue;
+		retval = __set_sched_attr(task.tid, attr);
+		if (retval)
+			goto out_err;
+	}
+
+	closeproc(ptp);
+	return 0;
+
+out_err:
+	closeproc(ptp);
+	return 1;
+}
+
+#define INVALID_VAL	(~0L)
+static long get_long_ns_after_colon(char *start)
+{
+	long val = INVALID_VAL;
+
+	/* find the ":" */
+	start = strstr(start, ":");
+	if (!start)
+		return -1;
+
+	/* skip ":" */
+	start++;
+	val = parse_ns_duration(start);
+
+	return val;
+}
+
+static long get_long_after_colon(char *start)
+{
+	long val = INVALID_VAL;
+
+	/* find the ":" */
+	start = strstr(start, ":");
+	if (!start)
+		return -1;
+
+	/* skip ":" */
+	start++;
+	val = get_llong_from_str(start);
+
+	return val;
+}
+
+/*
+ * parse priority in the format:
+ * SCHED_OTHER:
+ *		o:<prio>
+ *		O:<prio>
+ * SCHED_RR:
+ *		r:<prio>
+ *		R:<prio>
+ * SCHED_FIFO:
+ *		f:<prio>
+ *		F:<prio>
+ * SCHED_DEADLINE:
+ *		d:runtime:period
+ *		D:runtime:period
+ */
+int parse_prio(char *arg, struct sched_attr *sched_param)
+{
+	long prio;
+	long runtime;
+	long period;
+
+	memset(sched_param, 0, sizeof(*sched_param));
+	sched_param->size = sizeof(*sched_param);
+
+	switch (arg[0]) {
+	case 'd':
+	case 'D':
+		/* d:runtime:period */
+		if (strlen(arg) < 4)
+			return -1;
+
+		runtime = get_long_ns_after_colon(arg);
+		if (runtime == INVALID_VAL)
+			return -1;
+
+		period = get_long_ns_after_colon(&arg[2]);
+		if (period == INVALID_VAL)
+			return -1;
+
+		if (runtime > period)
+			return -1;
+
+		sched_param->sched_policy   = SCHED_DEADLINE;
+		sched_param->sched_runtime  = runtime;
+		sched_param->sched_deadline = period;
+		sched_param->sched_period   = period;
+		break;
+	case 'f':
+	case 'F':
+		/* f:prio */
+		prio = get_long_after_colon(arg);
+		if (prio == INVALID_VAL)
+			return -1;
+
+		if (prio < sched_get_priority_min(SCHED_FIFO))
+			return -1;
+		if (prio > sched_get_priority_max(SCHED_FIFO))
+			return -1;
+
+		sched_param->sched_policy   = SCHED_FIFO;
+		sched_param->sched_priority = prio;
+		break;
+	case 'r':
+	case 'R':
+		/* r:prio */
+		prio = get_long_after_colon(arg);
+		if (prio == INVALID_VAL)
+			return -1;
+
+		if (prio < sched_get_priority_min(SCHED_RR))
+			return -1;
+		if (prio > sched_get_priority_max(SCHED_RR))
+			return -1;
+
+		sched_param->sched_policy   = SCHED_RR;
+		sched_param->sched_priority = prio;
+		break;
+	case 'o':
+	case 'O':
+		/* o:prio */
+		prio = get_long_after_colon(arg);
+		if (prio == INVALID_VAL)
+			return -1;
+
+		if (prio < sched_get_priority_min(SCHED_OTHER))
+			return -1;
+		if (prio > sched_get_priority_max(SCHED_OTHER))
+			return -1;
+
+		sched_param->sched_policy   = SCHED_OTHER;
+		sched_param->sched_priority = prio;
+		break;
+	default:
+		return -1;
+	}
+	return 0;
+}
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
new file mode 100644
index 000000000000..9aa962319ca2
--- /dev/null
+++ b/tools/tracing/rtla/src/utils.h
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <time.h>
+
+/*
+ * '18446744073709551615\0'
+ */
+#define BUFF_U64_STR_SIZE	24
+
+#define container_of(ptr, type, member)({			\
+	const typeof(((type *)0)->member) *__mptr = (ptr);	\
+	(type *)((char *)__mptr - offsetof(type, member)) ; })
+
+extern int config_debug;
+void debug_msg(const char *fmt, ...);
+void err_msg(const char *fmt, ...);
+
+long parse_seconds_duration(char *val);
+void get_duration(time_t start_time, char *output, int output_size);
+
+int parse_cpu_list(char *cpu_list, char **monitored_cpus);
+long long get_llong_from_str(char *start);
+
+static inline void
+update_min(unsigned long long *a, unsigned long long *b)
+{
+	if (*a > *b)
+		*a = *b;
+}
+
+static inline void
+update_max(unsigned long long *a, unsigned long long *b)
+{
+	if (*a < *b)
+		*a = *b;
+}
+
+static inline void
+update_sum(unsigned long long *a, unsigned long long *b)
+{
+	*a += *b;
+}
+
+struct sched_attr {
+	uint32_t size;
+	uint32_t sched_policy;
+	uint64_t sched_flags;
+	int32_t sched_nice;
+	uint32_t sched_priority;
+	uint64_t sched_runtime;
+	uint64_t sched_deadline;
+	uint64_t sched_period;
+};
+
+int parse_prio(char *arg, struct sched_attr *sched_param);
+int set_comm_sched_attr(const char *comm, struct sched_attr *attr);
-- 
cgit v1.2.3


From 0605bf009f18d558758df9edef59698bf1389b28 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:22 +0100
Subject: rtla: Add osnoise tool

The osnoise tool is the interface for the osnoise tracer. The osnoise
tool will have multiple "modes" with different outputs. At this point,
no mode is included.

The osnoise.c includes the osnoise_context abstraction. It serves to
read-save-change-restore the default values from tracing/osnoise/
directory. When the context is deleted, the default values are restored.

It also includes some other helper functions for managing osnoise
tracer sessions.

With these bits and pieces in place, we can start adding some
functionality to rtla.

Link: https://lkml.kernel.org/r/2d44c21ff561f503b4c7b1813892761818118460.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/Makefile      |   2 +
 tools/tracing/rtla/src/osnoise.c | 855 +++++++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/osnoise.h |  89 ++++
 tools/tracing/rtla/src/rtla.c    |  10 +
 4 files changed, 956 insertions(+)
 create mode 100644 tools/tracing/rtla/src/osnoise.c
 create mode 100644 tools/tracing/rtla/src/osnoise.h

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 56b1bdd60c96..d40b442fcd30 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -60,6 +60,8 @@ install:
 	$(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR)
 	$(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR)
 	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
+	@test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise
+	ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/osnoise
 
 .PHONY: clean tarball
 clean:
diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
new file mode 100644
index 000000000000..e5254044e878
--- /dev/null
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -0,0 +1,855 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+#include "osnoise.h"
+#include "utils.h"
+
+/*
+ * osnoise_get_cpus - return the original "osnoise/cpus" content
+ *
+ * It also saves the value to be restored.
+ */
+char *osnoise_get_cpus(struct osnoise_context *context)
+{
+	if (context->curr_cpus)
+		return context->curr_cpus;
+
+	if (context->orig_cpus)
+		return context->orig_cpus;
+
+	context->orig_cpus = tracefs_instance_file_read(NULL, "osnoise/cpus", NULL);
+
+	/*
+	 * The error value (NULL) is the same for tracefs_instance_file_read()
+	 * and this functions, so:
+	 */
+	return context->orig_cpus;
+}
+
+/*
+ * osnoise_set_cpus - configure osnoise to run on *cpus
+ *
+ * "osnoise/cpus" file is used to set the cpus in which osnoise/timerlat
+ * will run. This function opens this file, saves the current value,
+ * and set the cpus passed as argument.
+ */
+int osnoise_set_cpus(struct osnoise_context *context, char *cpus)
+{
+	char *orig_cpus = osnoise_get_cpus(context);
+	char buffer[1024];
+	int retval;
+
+	if (!orig_cpus)
+		return -1;
+
+	context->curr_cpus = strdup(cpus);
+	if (!context->curr_cpus)
+		return -1;
+
+	snprintf(buffer, 1024, "%s\n", cpus);
+
+	debug_msg("setting cpus to %s from %s", cpus, context->orig_cpus);
+
+	retval = tracefs_instance_file_write(NULL, "osnoise/cpus", buffer);
+	if (retval < 0) {
+		free(context->curr_cpus);
+		context->curr_cpus = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_cpus - restore the original "osnoise/cpus"
+ *
+ * osnoise_set_cpus() saves the original data for the "osnoise/cpus"
+ * file. This function restore the original config it was previously
+ * modified.
+ */
+void osnoise_restore_cpus(struct osnoise_context *context)
+{
+	int retval;
+
+	if (!context->orig_cpus)
+		return;
+
+	if (!context->curr_cpus)
+		return;
+
+	/* nothing to do? */
+	if (!strcmp(context->orig_cpus, context->curr_cpus))
+		goto out_done;
+
+	debug_msg("restoring cpus to %s", context->orig_cpus);
+
+	retval = tracefs_instance_file_write(NULL, "osnoise/cpus", context->orig_cpus);
+	if (retval < 0)
+		err_msg("could not restore original osnoise cpus\n");
+
+out_done:
+	free(context->curr_cpus);
+	context->curr_cpus = NULL;
+}
+
+/*
+ * osnoise_put_cpus - restore cpus config and cleanup data
+ */
+void osnoise_put_cpus(struct osnoise_context *context)
+{
+	osnoise_restore_cpus(context);
+
+	if (!context->orig_cpus)
+		return;
+
+	free(context->orig_cpus);
+	context->orig_cpus = NULL;
+}
+
+/*
+ * osnoise_read_ll_config - read a long long value from a config
+ *
+ * returns -1 on error.
+ */
+static long long osnoise_read_ll_config(char *rel_path)
+{
+	long long retval;
+	char *buffer;
+
+	buffer = tracefs_instance_file_read(NULL, rel_path, NULL);
+	if (!buffer)
+		return -1;
+
+	/* get_llong_from_str returns -1 on error */
+	retval = get_llong_from_str(buffer);
+
+	debug_msg("reading %s returned %lld\n", rel_path, retval);
+
+	free(buffer);
+
+	return retval;
+}
+
+/*
+ * osnoise_write_ll_config - write a long long value to a config in rel_path
+ *
+ * returns -1 on error.
+ */
+static long long osnoise_write_ll_config(char *rel_path, long long value)
+{
+	char buffer[BUFF_U64_STR_SIZE];
+	long long retval;
+
+	snprintf(buffer, sizeof(buffer), "%lld\n", value);
+
+	debug_msg("setting %s to %lld\n", rel_path, value);
+
+	retval = tracefs_instance_file_write(NULL, rel_path, buffer);
+	return retval;
+}
+
+/*
+ * osnoise_get_runtime - return the original "osnoise/runtime_us" value
+ *
+ * It also saves the value to be restored.
+ */
+unsigned long long osnoise_get_runtime(struct osnoise_context *context)
+{
+	long long runtime_us;
+
+	if (context->runtime_us != OSNOISE_TIME_INIT_VAL)
+		return context->runtime_us;
+
+	if (context->orig_runtime_us != OSNOISE_TIME_INIT_VAL)
+		return context->orig_runtime_us;
+
+	runtime_us = osnoise_read_ll_config("osnoise/runtime_us");
+	if (runtime_us < 0)
+		goto out_err;
+
+	context->orig_runtime_us = runtime_us;
+	return runtime_us;
+
+out_err:
+	return OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_get_period - return the original "osnoise/period_us" value
+ *
+ * It also saves the value to be restored.
+ */
+unsigned long long osnoise_get_period(struct osnoise_context *context)
+{
+	long long period_us;
+
+	if (context->period_us != OSNOISE_TIME_INIT_VAL)
+		return context->period_us;
+
+	if (context->orig_period_us != OSNOISE_TIME_INIT_VAL)
+		return context->orig_period_us;
+
+	period_us = osnoise_read_ll_config("osnoise/period_us");
+	if (period_us < 0)
+		goto out_err;
+
+	context->orig_period_us = period_us;
+	return period_us;
+
+out_err:
+	return OSNOISE_TIME_INIT_VAL;
+}
+
+static int __osnoise_write_runtime(struct osnoise_context *context,
+				   unsigned long long runtime)
+{
+	int retval;
+
+	if (context->orig_runtime_us == OSNOISE_TIME_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/runtime_us", runtime);
+	if (retval < 0)
+		return -1;
+
+	context->runtime_us = runtime;
+	return 0;
+}
+
+static int __osnoise_write_period(struct osnoise_context *context,
+				  unsigned long long period)
+{
+	int retval;
+
+	if (context->orig_period_us == OSNOISE_TIME_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/period_us", period);
+	if (retval < 0)
+		return -1;
+
+	context->period_us = period;
+	return 0;
+}
+
+/*
+ * osnoise_set_runtime_period - set osnoise runtime and period
+ *
+ * Osnoise's runtime and period are related as runtime <= period.
+ * Thus, this function saves the original values, and then tries
+ * to set the runtime and period if they are != 0.
+ */
+int osnoise_set_runtime_period(struct osnoise_context *context,
+			       unsigned long long runtime,
+			       unsigned long long period)
+{
+	unsigned long long curr_runtime_us;
+	unsigned long long curr_period_us;
+	int retval;
+
+	if (!period && !runtime)
+		return 0;
+
+	curr_runtime_us = osnoise_get_runtime(context);
+	curr_period_us = osnoise_get_period(context);
+
+	/* error getting any value? */
+	if (curr_period_us == OSNOISE_TIME_INIT_VAL || curr_runtime_us == OSNOISE_TIME_INIT_VAL)
+		return -1;
+
+	if (!period) {
+		if (runtime > curr_period_us)
+			return -1;
+		return __osnoise_write_runtime(context, runtime);
+	} else if (!runtime) {
+		if (period < curr_runtime_us)
+			return -1;
+		return __osnoise_write_period(context, period);
+	}
+
+	if (runtime > curr_period_us) {
+		retval = __osnoise_write_period(context, period);
+		if (retval)
+			return -1;
+		retval = __osnoise_write_runtime(context, runtime);
+		if (retval)
+			return -1;
+	} else {
+		retval = __osnoise_write_runtime(context, runtime);
+		if (retval)
+			return -1;
+		retval = __osnoise_write_period(context, period);
+		if (retval)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_runtime_period - restore the original runtime and period
+ */
+void osnoise_restore_runtime_period(struct osnoise_context *context)
+{
+	unsigned long long orig_runtime = context->orig_runtime_us;
+	unsigned long long orig_period = context->orig_period_us;
+	unsigned long long curr_runtime = context->runtime_us;
+	unsigned long long curr_period = context->period_us;
+	int retval;
+
+	if ((orig_runtime == OSNOISE_TIME_INIT_VAL) && (orig_period == OSNOISE_TIME_INIT_VAL))
+		return;
+
+	if ((orig_period == curr_period) && (orig_runtime == curr_runtime))
+		goto out_done;
+
+	retval = osnoise_set_runtime_period(context, orig_runtime, orig_period);
+	if (retval)
+		err_msg("Could not restore original osnoise runtime/period\n");
+
+out_done:
+	context->runtime_us = OSNOISE_TIME_INIT_VAL;
+	context->period_us = OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_put_runtime_period - restore original values and cleanup data
+ */
+void osnoise_put_runtime_period(struct osnoise_context *context)
+{
+	osnoise_restore_runtime_period(context);
+
+	if (context->orig_runtime_us != OSNOISE_TIME_INIT_VAL)
+		context->orig_runtime_us = OSNOISE_TIME_INIT_VAL;
+
+	if (context->orig_period_us != OSNOISE_TIME_INIT_VAL)
+		context->orig_period_us = OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_get_timerlat_period_us - read and save the original "timerlat_period_us"
+ */
+static long long
+osnoise_get_timerlat_period_us(struct osnoise_context *context)
+{
+	long long timerlat_period_us;
+
+	if (context->timerlat_period_us != OSNOISE_TIME_INIT_VAL)
+		return context->timerlat_period_us;
+
+	if (context->orig_timerlat_period_us != OSNOISE_TIME_INIT_VAL)
+		return context->orig_timerlat_period_us;
+
+	timerlat_period_us = osnoise_read_ll_config("osnoise/timerlat_period_us");
+	if (timerlat_period_us < 0)
+		goto out_err;
+
+	context->orig_timerlat_period_us = timerlat_period_us;
+	return timerlat_period_us;
+
+out_err:
+	return OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_set_timerlat_period_us - set "timerlat_period_us"
+ */
+int osnoise_set_timerlat_period_us(struct osnoise_context *context, long long timerlat_period_us)
+{
+	long long curr_timerlat_period_us = osnoise_get_timerlat_period_us(context);
+	int retval;
+
+	if (curr_timerlat_period_us == OSNOISE_TIME_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/timerlat_period_us", timerlat_period_us);
+	if (retval < 0)
+		return -1;
+
+	context->timerlat_period_us = timerlat_period_us;
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_timerlat_period_us - restore "timerlat_period_us"
+ */
+void osnoise_restore_timerlat_period_us(struct osnoise_context *context)
+{
+	int retval;
+
+	if (context->orig_timerlat_period_us == OSNOISE_TIME_INIT_VAL)
+		return;
+
+	if (context->orig_timerlat_period_us == context->timerlat_period_us)
+		goto out_done;
+
+	retval = osnoise_write_ll_config("osnoise/timerlat_period_us", context->orig_timerlat_period_us);
+	if (retval < 0)
+		err_msg("Could not restore original osnoise timerlat_period_us\n");
+
+out_done:
+	context->timerlat_period_us = OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_put_timerlat_period_us - restore original values and cleanup data
+ */
+void osnoise_put_timerlat_period_us(struct osnoise_context *context)
+{
+	osnoise_restore_timerlat_period_us(context);
+
+	if (context->orig_timerlat_period_us == OSNOISE_TIME_INIT_VAL)
+		return;
+
+	context->orig_timerlat_period_us = OSNOISE_TIME_INIT_VAL;
+}
+
+/*
+ * osnoise_get_stop_us - read and save the original "stop_tracing_us"
+ */
+static long long
+osnoise_get_stop_us(struct osnoise_context *context)
+{
+	long long stop_us;
+
+	if (context->stop_us != OSNOISE_OPTION_INIT_VAL)
+		return context->stop_us;
+
+	if (context->orig_stop_us != OSNOISE_OPTION_INIT_VAL)
+		return context->orig_stop_us;
+
+	stop_us = osnoise_read_ll_config("osnoise/stop_tracing_us");
+	if (stop_us < 0)
+		goto out_err;
+
+	context->orig_stop_us = stop_us;
+	return stop_us;
+
+out_err:
+	return OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_set_stop_us - set "stop_tracing_us"
+ */
+int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us)
+{
+	long long curr_stop_us = osnoise_get_stop_us(context);
+	int retval;
+
+	if (curr_stop_us == OSNOISE_OPTION_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/stop_tracing_us", stop_us);
+	if (retval < 0)
+		return -1;
+
+	context->stop_us = stop_us;
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_stop_us - restore the original "stop_tracing_us"
+ */
+void osnoise_restore_stop_us(struct osnoise_context *context)
+{
+	int retval;
+
+	if (context->orig_stop_us == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	if (context->orig_stop_us == context->stop_us)
+		goto out_done;
+
+	retval = osnoise_write_ll_config("osnoise/stop_tracing_us", context->orig_stop_us);
+	if (retval < 0)
+		err_msg("Could not restore original osnoise stop_us\n");
+
+out_done:
+	context->stop_us = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_put_stop_us - restore original values and cleanup data
+ */
+void osnoise_put_stop_us(struct osnoise_context *context)
+{
+	osnoise_restore_stop_us(context);
+
+	if (context->orig_stop_us == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	context->orig_stop_us = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_get_stop_total_us - read and save the original "stop_tracing_total_us"
+ */
+static long long
+osnoise_get_stop_total_us(struct osnoise_context *context)
+{
+	long long stop_total_us;
+
+	if (context->stop_total_us != OSNOISE_OPTION_INIT_VAL)
+		return context->stop_total_us;
+
+	if (context->orig_stop_total_us != OSNOISE_OPTION_INIT_VAL)
+		return context->orig_stop_total_us;
+
+	stop_total_us = osnoise_read_ll_config("osnoise/stop_tracing_total_us");
+	if (stop_total_us < 0)
+		goto out_err;
+
+	context->orig_stop_total_us = stop_total_us;
+	return stop_total_us;
+
+out_err:
+	return OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_set_stop_total_us - set "stop_tracing_total_us"
+ */
+int osnoise_set_stop_total_us(struct osnoise_context *context, long long stop_total_us)
+{
+	long long curr_stop_total_us = osnoise_get_stop_total_us(context);
+	int retval;
+
+	if (curr_stop_total_us == OSNOISE_OPTION_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/stop_tracing_total_us", stop_total_us);
+	if (retval < 0)
+		return -1;
+
+	context->stop_total_us = stop_total_us;
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_stop_total_us - restore the original "stop_tracing_total_us"
+ */
+void osnoise_restore_stop_total_us(struct osnoise_context *context)
+{
+	int retval;
+
+	if (context->orig_stop_total_us == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	if (context->orig_stop_total_us == context->stop_total_us)
+		goto out_done;
+
+	retval = osnoise_write_ll_config("osnoise/stop_tracing_total_us",
+			context->orig_stop_total_us);
+	if (retval < 0)
+		err_msg("Could not restore original osnoise stop_total_us\n");
+
+out_done:
+	context->stop_total_us = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_put_stop_total_us - restore original values and cleanup data
+ */
+void osnoise_put_stop_total_us(struct osnoise_context *context)
+{
+	osnoise_restore_stop_total_us(context);
+
+	if (context->orig_stop_total_us == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	context->orig_stop_total_us = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_get_print_stack - read and save the original "print_stack"
+ */
+static long long
+osnoise_get_print_stack(struct osnoise_context *context)
+{
+	long long print_stack;
+
+	if (context->print_stack != OSNOISE_OPTION_INIT_VAL)
+		return context->print_stack;
+
+	if (context->orig_print_stack != OSNOISE_OPTION_INIT_VAL)
+		return context->orig_print_stack;
+
+	print_stack = osnoise_read_ll_config("osnoise/print_stack");
+	if (print_stack < 0)
+		goto out_err;
+
+	context->orig_print_stack = print_stack;
+	return print_stack;
+
+out_err:
+	return OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_set_print_stack - set "print_stack"
+ */
+int osnoise_set_print_stack(struct osnoise_context *context, long long print_stack)
+{
+	long long curr_print_stack = osnoise_get_print_stack(context);
+	int retval;
+
+	if (curr_print_stack == OSNOISE_OPTION_INIT_VAL)
+		return -1;
+
+	retval = osnoise_write_ll_config("osnoise/print_stack", print_stack);
+	if (retval < 0)
+		return -1;
+
+	context->print_stack = print_stack;
+
+	return 0;
+}
+
+/*
+ * osnoise_restore_print_stack - restore the original "print_stack"
+ */
+void osnoise_restore_print_stack(struct osnoise_context *context)
+{
+	int retval;
+
+	if (context->orig_print_stack == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	if (context->orig_print_stack == context->print_stack)
+		goto out_done;
+
+	retval = osnoise_write_ll_config("osnoise/print_stack", context->orig_print_stack);
+	if (retval < 0)
+		err_msg("Could not restore original osnoise print_stack\n");
+
+out_done:
+	context->print_stack = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * osnoise_put_print_stack - restore original values and cleanup data
+ */
+void osnoise_put_print_stack(struct osnoise_context *context)
+{
+	osnoise_restore_print_stack(context);
+
+	if (context->orig_print_stack == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	context->orig_print_stack = OSNOISE_OPTION_INIT_VAL;
+}
+
+/*
+ * enable_osnoise - enable osnoise tracer in the trace_instance
+ */
+int enable_osnoise(struct trace_instance *trace)
+{
+	return enable_tracer_by_name(trace->inst, "osnoise");
+}
+
+/*
+ * enable_timerlat - enable timerlat tracer in the trace_instance
+ */
+int enable_timerlat(struct trace_instance *trace)
+{
+	return enable_tracer_by_name(trace->inst, "timerlat");
+}
+
+enum {
+	FLAG_CONTEXT_NEWLY_CREATED	= (1 << 0),
+	FLAG_CONTEXT_DELETED		= (1 << 1),
+};
+
+/*
+ * osnoise_get_context - increase the usage of a context and return it
+ */
+int osnoise_get_context(struct osnoise_context *context)
+{
+	int ret;
+
+	if (context->flags & FLAG_CONTEXT_DELETED) {
+		ret = -1;
+	} else {
+		context->ref++;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * osnoise_context_alloc - alloc an osnoise_context
+ *
+ * The osnoise context contains the information of the "osnoise/" configs.
+ * It is used to set and restore the config.
+ */
+struct osnoise_context *osnoise_context_alloc(void)
+{
+	struct osnoise_context *context;
+
+	context = calloc(1, sizeof(*context));
+	if (!context)
+		return NULL;
+
+	context->orig_stop_us		= OSNOISE_OPTION_INIT_VAL;
+	context->stop_us		= OSNOISE_OPTION_INIT_VAL;
+
+	context->orig_stop_total_us	= OSNOISE_OPTION_INIT_VAL;
+	context->stop_total_us		= OSNOISE_OPTION_INIT_VAL;
+
+	context->orig_print_stack	= OSNOISE_OPTION_INIT_VAL;
+	context->print_stack		= OSNOISE_OPTION_INIT_VAL;
+
+	osnoise_get_context(context);
+
+	return context;
+}
+
+/*
+ * osnoise_put_context - put the osnoise_put_context
+ *
+ * If there is no other user for the context, the original data
+ * is restored.
+ */
+void osnoise_put_context(struct osnoise_context *context)
+{
+	if (--context->ref < 1)
+		context->flags |= FLAG_CONTEXT_DELETED;
+
+	if (!(context->flags & FLAG_CONTEXT_DELETED))
+		return;
+
+	osnoise_put_cpus(context);
+	osnoise_put_runtime_period(context);
+	osnoise_put_stop_us(context);
+	osnoise_put_stop_total_us(context);
+	osnoise_put_timerlat_period_us(context);
+	osnoise_put_print_stack(context);
+
+	free(context);
+}
+
+/*
+ * osnoise_destroy_tool - disable trace, restore configs and free data
+ */
+void osnoise_destroy_tool(struct osnoise_tool *top)
+{
+	trace_instance_destroy(&top->trace);
+
+	if (top->context)
+		osnoise_put_context(top->context);
+
+	free(top);
+}
+
+/*
+ * osnoise_init_tool - init an osnoise tool
+ *
+ * It allocs data, create a context to store data and
+ * creates a new trace instance for the tool.
+ */
+struct osnoise_tool *osnoise_init_tool(char *tool_name)
+{
+	struct osnoise_tool *top;
+	int retval;
+
+	top = calloc(1, sizeof(*top));
+	if (!top)
+		return NULL;
+
+	top->context = osnoise_context_alloc();
+	if (!top->context)
+		goto out_err;
+
+	retval = trace_instance_init(&top->trace, tool_name);
+	if (retval)
+		goto out_err;
+
+	return top;
+out_err:
+	osnoise_destroy_tool(top);
+	return NULL;
+}
+
+/*
+ * osnoise_init_trace_tool - init a tracer instance to trace osnoise events
+ */
+struct osnoise_tool *osnoise_init_trace_tool(char *tracer)
+{
+	struct osnoise_tool *trace;
+	int retval;
+
+	trace = osnoise_init_tool("osnoise_trace");
+	if (!trace)
+		return NULL;
+
+	retval = tracefs_event_enable(trace->trace.inst, "osnoise", NULL);
+	if (retval < 0 && !errno) {
+		err_msg("Could not find osnoise events\n");
+		goto out_err;
+	}
+
+	retval = enable_tracer_by_name(trace->trace.inst, tracer);
+	if (retval) {
+		err_msg("Could not enable osnoiser tracer for tracing\n");
+		goto out_err;
+	}
+
+	return trace;
+out_err:
+	osnoise_destroy_tool(trace);
+	return NULL;
+}
+
+static void osnoise_usage(void)
+{
+	int i;
+
+	static const char *msg[] = {
+		"",
+		"osnoise version " VERSION,
+		"",
+		"  usage: [rtla] osnoise [MODE] ...",
+		"",
+		"  modes:",
+		"     top  - prints the summary from osnoise tracer",
+		"",
+		"if no MODE is given, the top mode is called, passing the arguments",
+		NULL,
+	};
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+int osnoise_main(int argc, char *argv[])
+{
+	if (argc == 0)
+		goto usage;
+
+	if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) {
+		osnoise_usage();
+		exit(0);
+	}
+
+usage:
+	osnoise_usage();
+	exit(1);
+}
diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
new file mode 100644
index 000000000000..1184fc66dd50
--- /dev/null
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "trace.h"
+
+/*
+ * osnoise_context - read, store, write, restore osnoise configs.
+ */
+struct osnoise_context {
+	int			flags;
+	int			ref;
+
+	char			*curr_cpus;
+	char			*orig_cpus;
+
+	/* 0 as init value */
+	unsigned long long	orig_runtime_us;
+	unsigned long long	runtime_us;
+
+	/* 0 as init value */
+	unsigned long long	orig_period_us;
+	unsigned long long	period_us;
+
+	/* 0 as init value */
+	long long		orig_timerlat_period_us;
+	long long		timerlat_period_us;
+
+	/* -1 as init value because 0 is disabled */
+	long long		orig_stop_us;
+	long long		stop_us;
+
+	/* -1 as init value because 0 is disabled */
+	long long		orig_stop_total_us;
+	long long		stop_total_us;
+
+	/* -1 as init value because 0 is disabled */
+	long long		orig_print_stack;
+	long long		print_stack;
+};
+
+/*
+ * *_INIT_VALs are also invalid values, they are used to
+ * communicate errors.
+ */
+#define OSNOISE_OPTION_INIT_VAL	(-1)
+#define OSNOISE_TIME_INIT_VAL	(0)
+
+struct osnoise_context *osnoise_context_alloc(void);
+int osnoise_get_context(struct osnoise_context *context);
+void osnoise_put_context(struct osnoise_context *context);
+
+int osnoise_set_cpus(struct osnoise_context *context, char *cpus);
+void osnoise_restore_cpus(struct osnoise_context *context);
+
+int osnoise_set_runtime_period(struct osnoise_context *context,
+			       unsigned long long runtime,
+			       unsigned long long period);
+void osnoise_restore_runtime_period(struct osnoise_context *context);
+
+int osnoise_set_stop_us(struct osnoise_context *context,
+			long long stop_us);
+void osnoise_restore_stop_us(struct osnoise_context *context);
+
+int osnoise_set_stop_total_us(struct osnoise_context *context,
+			      long long stop_total_us);
+void osnoise_restore_stop_total_us(struct osnoise_context *context);
+
+int osnoise_set_timerlat_period_us(struct osnoise_context *context,
+				   long long timerlat_period_us);
+void osnoise_restore_timerlat_period_us(struct osnoise_context *context);
+
+void osnoise_restore_print_stack(struct osnoise_context *context);
+int osnoise_set_print_stack(struct osnoise_context *context,
+			    long long print_stack);
+
+/*
+ * osnoise_tool -  osnoise based tool definition.
+ */
+struct osnoise_tool {
+	struct trace_instance		trace;
+	struct osnoise_context		*context;
+	void				*data;
+	void				*params;
+	time_t				start_time;
+};
+
+void osnoise_destroy_tool(struct osnoise_tool *top);
+struct osnoise_tool *osnoise_init_tool(char *tool_name);
+struct osnoise_tool *osnoise_init_trace_tool(char *tracer);
+
+int osnoise_main(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
index 5ae2664ed47d..669b9750b3b3 100644
--- a/tools/tracing/rtla/src/rtla.c
+++ b/tools/tracing/rtla/src/rtla.c
@@ -8,6 +8,8 @@
 #include <string.h>
 #include <stdio.h>
 
+#include "osnoise.h"
+
 /*
  * rtla_usage - print rtla usage
  */
@@ -22,6 +24,7 @@ static void rtla_usage(void)
 		"  usage: rtla COMMAND ...",
 		"",
 		"  commands:",
+		"     osnoise  - gives information about the operating system noise (osnoise)",
 		"",
 		NULL,
 	};
@@ -39,7 +42,14 @@ static void rtla_usage(void)
  */
 int run_command(int argc, char **argv, int start_position)
 {
+	if (strcmp(argv[start_position], "osnoise") == 0) {
+		osnoise_main(argc-start_position, &argv[start_position]);
+		goto ran;
+	}
+
 	return 0;
+ran:
+	return 1;
 }
 
 int main(int argc, char *argv[])
-- 
cgit v1.2.3


From 1eceb2fc2ca549a170d7ee7cd1fde2daeda646ac Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:23 +0100
Subject: rtla/osnoise: Add osnoise top mode

The rtla osnoise tool is an interface for the osnoise tracer. The
osnoise tracer dispatches a kernel thread per-cpu. These threads read
the time in a loop while with preemption, softirqs and IRQs enabled,
thus allowing all the sources of osnoise during its execution. The
osnoise threads take note of the entry and exit point of any source
of interferences, increasing a per-cpu interference counter. The
osnoise tracer also saves an interference counter for each source
of interference.

The rtla osnoise top mode displays information about the periodic
summary from the osnoise tracer.

One example of rtla osnoise top output is:

[root@alien ~]# rtla osnoise top -c 0-3 -d 1m -q -r 900000 -P F:1
                                         Operating System Noise
duration:   0 00:01:00 | time is in us
CPU Period       Runtime        Noise  % CPU Aval   Max Noise   Max Single          HW          NMI          IRQ      Softirq       Thread
  0 #58         52200000         1031    99.99802          91           60           0            0        52285            0          101
  1 #59         53100000            5    99.99999           5            5           0            9        53122            0           18
  2 #59         53100000            7    99.99998           7            7           0            8        53115            0           18
  3 #59         53100000         8274    99.98441         277           23           0            9        53778            0          660

"rtla osnoise top --help" works and provide information about the
available options.

Link: https://lkml.kernel.org/r/0d796993abf587ae5a170bb8415c49368d4999e1.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/osnoise.c     |  16 +
 tools/tracing/rtla/src/osnoise.h     |   1 +
 tools/tracing/rtla/src/osnoise_top.c | 579 +++++++++++++++++++++++++++++++++++
 3 files changed, 596 insertions(+)
 create mode 100644 tools/tracing/rtla/src/osnoise_top.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index e5254044e878..fbf4f9bfda02 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -844,9 +844,25 @@ int osnoise_main(int argc, char *argv[])
 	if (argc == 0)
 		goto usage;
 
+	/*
+	 * if osnoise was called without any argument, run the
+	 * default cmdline.
+	 */
+	if (argc == 1) {
+		osnoise_top_main(argc, argv);
+		exit(0);
+	}
+
 	if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) {
 		osnoise_usage();
 		exit(0);
+	} else if (strncmp(argv[1], "-", 1) == 0) {
+		/* the user skipped the tool, call the default one */
+		osnoise_top_main(argc, argv);
+		exit(0);
+	} else if (strcmp(argv[1], "top") == 0) {
+		osnoise_top_main(argc-1, &argv[1]);
+		exit(0);
 	}
 
 usage:
diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
index 1184fc66dd50..a4b114ed7d71 100644
--- a/tools/tracing/rtla/src/osnoise.h
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -86,4 +86,5 @@ void osnoise_destroy_tool(struct osnoise_tool *top);
 struct osnoise_tool *osnoise_init_tool(char *tool_name);
 struct osnoise_tool *osnoise_init_trace_tool(char *tracer);
 
+int osnoise_top_main(int argc, char **argv);
 int osnoise_main(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
new file mode 100644
index 000000000000..332b2ac205fc
--- /dev/null
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -0,0 +1,579 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "osnoise.h"
+#include "utils.h"
+
+/*
+ * osnoise top parameters
+ */
+struct osnoise_top_params {
+	char			*cpus;
+	char			*monitored_cpus;
+	char			*trace_output;
+	unsigned long long	runtime;
+	unsigned long long	period;
+	long long		stop_us;
+	long long		stop_total_us;
+	int			sleep_time;
+	int			duration;
+	int			quiet;
+	int			set_sched;
+	struct sched_attr	sched_param;
+};
+
+struct osnoise_top_cpu {
+	unsigned long long	sum_runtime;
+	unsigned long long	sum_noise;
+	unsigned long long	max_noise;
+	unsigned long long	max_sample;
+
+	unsigned long long	hw_count;
+	unsigned long long	nmi_count;
+	unsigned long long	irq_count;
+	unsigned long long	softirq_count;
+	unsigned long long	thread_count;
+
+	int			sum_cycles;
+};
+
+struct osnoise_top_data {
+	struct osnoise_top_cpu	*cpu_data;
+	int			nr_cpus;
+};
+
+/*
+ * osnoise_free_top - free runtime data
+ */
+static void
+osnoise_free_top(struct osnoise_top_data *data)
+{
+	free(data->cpu_data);
+	free(data);
+}
+
+/*
+ * osnoise_alloc_histogram - alloc runtime data
+ */
+static struct osnoise_top_data *osnoise_alloc_top(int nr_cpus)
+{
+	struct osnoise_top_data *data;
+
+	data = calloc(1, sizeof(*data));
+	if (!data)
+		return NULL;
+
+	data->nr_cpus = nr_cpus;
+
+	/* one set of histograms per CPU */
+	data->cpu_data = calloc(1, sizeof(*data->cpu_data) * nr_cpus);
+	if (!data->cpu_data)
+		goto cleanup;
+
+	return data;
+
+cleanup:
+	osnoise_free_top(data);
+	return NULL;
+}
+
+/*
+ * osnoise_top_handler - this is the handler for osnoise tracer events
+ */
+static int
+osnoise_top_handler(struct trace_seq *s, struct tep_record *record,
+		    struct tep_event *event, void *context)
+{
+	struct trace_instance *trace = context;
+	struct osnoise_tool *tool;
+	unsigned long long val;
+	struct osnoise_top_cpu *cpu_data;
+	struct osnoise_top_data *data;
+	int cpu = record->cpu;
+
+	tool = container_of(trace, struct osnoise_tool, trace);
+
+	data = tool->data;
+	cpu_data = &data->cpu_data[cpu];
+
+	cpu_data->sum_cycles++;
+
+	tep_get_field_val(s, event, "runtime", record, &val, 1);
+	update_sum(&cpu_data->sum_runtime, &val);
+
+	tep_get_field_val(s, event, "noise", record, &val, 1);
+	update_max(&cpu_data->max_noise, &val);
+	update_sum(&cpu_data->sum_noise, &val);
+
+	tep_get_field_val(s, event, "max_sample", record, &val, 1);
+	update_max(&cpu_data->max_sample, &val);
+
+	tep_get_field_val(s, event, "hw_count", record, &val, 1);
+	update_sum(&cpu_data->hw_count, &val);
+
+	tep_get_field_val(s, event, "nmi_count", record, &val, 1);
+	update_sum(&cpu_data->nmi_count, &val);
+
+	tep_get_field_val(s, event, "irq_count", record, &val, 1);
+	update_sum(&cpu_data->irq_count, &val);
+
+	tep_get_field_val(s, event, "softirq_count", record, &val, 1);
+	update_sum(&cpu_data->softirq_count, &val);
+
+	tep_get_field_val(s, event, "thread_count", record, &val, 1);
+	update_sum(&cpu_data->thread_count, &val);
+
+	return 0;
+}
+
+/*
+ * osnoise_top_header - print the header of the tool output
+ */
+static void osnoise_top_header(struct osnoise_tool *top)
+{
+	struct trace_seq *s = top->trace.seq;
+	char duration[26];
+
+	get_duration(top->start_time, duration, sizeof(duration));
+
+	trace_seq_printf(s, "\033[2;37;40m");
+	trace_seq_printf(s, "                                          Operating System Noise");
+	trace_seq_printf(s, "                                     ");
+	trace_seq_printf(s, "                                     ");
+	trace_seq_printf(s, "\033[0;0;0m");
+	trace_seq_printf(s, "\n");
+
+	trace_seq_printf(s, "duration: %9s | time is in us\n", duration);
+
+	trace_seq_printf(s, "\033[2;30;47m");
+	trace_seq_printf(s, "CPU Period       Runtime ");
+	trace_seq_printf(s, "       Noise ");
+	trace_seq_printf(s, " %% CPU Aval ");
+	trace_seq_printf(s, "  Max Noise   Max Single ");
+	trace_seq_printf(s, "         HW          NMI          IRQ      Softirq       Thread");
+	trace_seq_printf(s, "\033[0;0;0m");
+	trace_seq_printf(s, "\n");
+}
+
+/*
+ * clear_terminal - clears the output terminal
+ */
+static void clear_terminal(struct trace_seq *seq)
+{
+	if (!config_debug)
+		trace_seq_printf(seq, "\033c");
+}
+
+/*
+ * osnoise_top_print - prints the output of a given CPU
+ */
+static void osnoise_top_print(struct osnoise_tool *tool, int cpu)
+{
+	struct trace_seq *s = tool->trace.seq;
+	struct osnoise_top_cpu *cpu_data;
+	struct osnoise_top_data *data;
+	int percentage;
+	int decimal;
+
+	data = tool->data;
+	cpu_data = &data->cpu_data[cpu];
+
+	if (!cpu_data->sum_runtime)
+		return;
+
+	percentage = ((cpu_data->sum_runtime - cpu_data->sum_noise) * 10000000)
+			/ cpu_data->sum_runtime;
+	decimal = percentage % 100000;
+	percentage = percentage / 100000;
+
+	trace_seq_printf(s, "%3d #%-6d %12llu ", cpu, cpu_data->sum_cycles, cpu_data->sum_runtime);
+	trace_seq_printf(s, "%12llu ", cpu_data->sum_noise);
+	trace_seq_printf(s, "  %3d.%05d", percentage, decimal);
+	trace_seq_printf(s, "%12llu %12llu", cpu_data->max_noise, cpu_data->max_sample);
+
+	trace_seq_printf(s, "%12llu ", cpu_data->hw_count);
+	trace_seq_printf(s, "%12llu ", cpu_data->nmi_count);
+	trace_seq_printf(s, "%12llu ", cpu_data->irq_count);
+	trace_seq_printf(s, "%12llu ", cpu_data->softirq_count);
+	trace_seq_printf(s, "%12llu\n", cpu_data->thread_count);
+}
+
+/*
+ * osnoise_print_stats - print data for all cpus
+ */
+static void
+osnoise_print_stats(struct osnoise_top_params *params, struct osnoise_tool *top)
+{
+	struct trace_instance *trace = &top->trace;
+	static int nr_cpus = -1;
+	int i;
+
+	if (nr_cpus == -1)
+		nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	if (!params->quiet)
+		clear_terminal(trace->seq);
+
+	osnoise_top_header(top);
+
+	for (i = 0; i < nr_cpus; i++) {
+		if (params->cpus && !params->monitored_cpus[i])
+			continue;
+		osnoise_top_print(top, i);
+	}
+
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+}
+
+/*
+ * osnoise_top_usage - prints osnoise top usage message
+ */
+void osnoise_top_usage(char *usage)
+{
+	int i;
+
+	static const char * const msg[] = {
+		"  usage: rtla osnoise [top] [-h] [-q] [-D] [-d s] [-p us] [-r us] [-s us] [-S us] [-t[=file]] \\",
+		"	  [-c cpu-list] [-P priority]",
+		"",
+		"	  -h/--help: print this menu",
+		"	  -p/--period us: osnoise period in us",
+		"	  -r/--runtime us: osnoise runtime in us",
+		"	  -s/--stop us: stop trace if a single sample is higher than the argument in us",
+		"	  -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
+		"	  -c/--cpus cpu-list: list of cpus to run osnoise threads",
+		"	  -d/--duration time[s|m|h|d]: duration of the session",
+		"	  -D/--debug: print debug info",
+		"	  -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]",
+		"	  -q/--quiet print only a summary at the end",
+		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
+		"		o:prio - use SCHED_OTHER with prio",
+		"		r:prio - use SCHED_RR with prio",
+		"		f:prio - use SCHED_FIFO with prio",
+		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
+		"						       in nanoseconds",
+		NULL,
+	};
+
+	if (usage)
+		fprintf(stderr, "%s\n", usage);
+
+	fprintf(stderr, "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n",
+			VERSION);
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+/*
+ * osnoise_top_parse_args - allocs, parse and fill the cmd line parameters
+ */
+struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
+{
+	struct osnoise_top_params *params;
+	int retval;
+	int c;
+
+	params = calloc(1, sizeof(*params));
+	if (!params)
+		exit(1);
+
+	while (1) {
+		static struct option long_options[] = {
+			{"cpus",		required_argument,	0, 'c'},
+			{"debug",		no_argument,		0, 'D'},
+			{"duration",		required_argument,	0, 'd'},
+			{"help",		no_argument,		0, 'h'},
+			{"period",		required_argument,	0, 'p'},
+			{"priority",		required_argument,	0, 'P'},
+			{"quiet",		no_argument,		0, 'q'},
+			{"runtime",		required_argument,	0, 'r'},
+			{"stop",		required_argument,	0, 's'},
+			{"stop-total",		required_argument,	0, 'S'},
+			{"trace",		optional_argument,	0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		/* getopt_long stores the option index here. */
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "c:d:Dhp:P:qr:s:S:t::",
+				 long_options, &option_index);
+
+		/* Detect the end of the options. */
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			retval = parse_cpu_list(optarg, &params->monitored_cpus);
+			if (retval)
+				osnoise_top_usage("\nInvalid -c cpu list\n");
+			params->cpus = optarg;
+			break;
+		case 'D':
+			config_debug = 1;
+			break;
+		case 'd':
+			params->duration = parse_seconds_duration(optarg);
+			if (!params->duration)
+				osnoise_top_usage("Invalid -D duration\n");
+			break;
+		case 'h':
+		case '?':
+			osnoise_top_usage(NULL);
+			break;
+		case 'p':
+			params->period = get_llong_from_str(optarg);
+			if (params->period > 10000000)
+				osnoise_top_usage("Period longer than 10 s\n");
+			break;
+		case 'P':
+			retval = parse_prio(optarg, &params->sched_param);
+			if (retval == -1)
+				osnoise_top_usage("Invalid -P priority");
+			params->set_sched = 1;
+			break;
+		case 'q':
+			params->quiet = 1;
+			break;
+		case 'r':
+			params->runtime = get_llong_from_str(optarg);
+			if (params->runtime < 100)
+				osnoise_top_usage("Runtime shorter than 100 us\n");
+			break;
+		case 's':
+			params->stop_us = get_llong_from_str(optarg);
+			break;
+		case 'S':
+			params->stop_total_us = get_llong_from_str(optarg);
+			break;
+		case 't':
+			if (optarg)
+				/* skip = */
+				params->trace_output = &optarg[1];
+			else
+				params->trace_output = "osnoise_trace.txt";
+			break;
+		default:
+			osnoise_top_usage("Invalid option");
+		}
+	}
+
+	if (geteuid()) {
+		err_msg("osnoise needs root permission\n");
+		exit(EXIT_FAILURE);
+	}
+
+	return params;
+}
+
+/*
+ * osnoise_top_apply_config - apply the top configs to the initialized tool
+ */
+static int
+osnoise_top_apply_config(struct osnoise_tool *tool, struct osnoise_top_params *params)
+{
+	int retval;
+
+	if (!params->sleep_time)
+		params->sleep_time = 1;
+
+	if (params->cpus) {
+		retval = osnoise_set_cpus(tool->context, params->cpus);
+		if (retval) {
+			err_msg("Failed to apply CPUs config\n");
+			goto out_err;
+		}
+	}
+
+	if (params->runtime || params->period) {
+		retval = osnoise_set_runtime_period(tool->context,
+						    params->runtime,
+						    params->period);
+		if (retval) {
+			err_msg("Failed to set runtime and/or period\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_us) {
+		retval = osnoise_set_stop_us(tool->context, params->stop_us);
+		if (retval) {
+			err_msg("Failed to set stop us\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_total_us) {
+		retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us);
+		if (retval) {
+			err_msg("Failed to set stop total us\n");
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	return -1;
+}
+
+/*
+ * osnoise_init_top - initialize a osnoise top tool with parameters
+ */
+struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params)
+{
+	struct osnoise_tool *tool;
+	int nr_cpus;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	tool = osnoise_init_tool("osnoise_top");
+	if (!tool)
+		return NULL;
+
+	tool->data = osnoise_alloc_top(nr_cpus);
+	if (!tool->data)
+		goto out_err;
+
+	tool->params = params;
+
+	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "osnoise",
+				   osnoise_top_handler, NULL);
+
+	return tool;
+
+out_err:
+	osnoise_free_top(tool->data);
+	osnoise_destroy_tool(tool);
+	return NULL;
+}
+
+static int stop_tracing;
+static void stop_top(int sig)
+{
+	stop_tracing = 1;
+}
+
+/*
+ * osnoise_top_set_signals - handles the signal to stop the tool
+ */
+static void osnoise_top_set_signals(struct osnoise_top_params *params)
+{
+	signal(SIGINT, stop_top);
+	if (params->duration) {
+		signal(SIGALRM, stop_top);
+		alarm(params->duration);
+	}
+}
+
+int osnoise_top_main(int argc, char **argv)
+{
+	struct osnoise_top_params *params;
+	struct trace_instance *trace;
+	struct osnoise_tool *record;
+	struct osnoise_tool *tool;
+	int return_value = 1;
+	int retval;
+
+	params = osnoise_top_parse_args(argc, argv);
+	if (!params)
+		exit(1);
+
+	tool = osnoise_init_top(params);
+	if (!tool) {
+		err_msg("Could not init osnoise top\n");
+		goto out_exit;
+	}
+
+	retval = osnoise_top_apply_config(tool, params);
+	if (retval) {
+		err_msg("Could not apply config\n");
+		goto out_top;
+	}
+
+	trace = &tool->trace;
+
+	retval = enable_osnoise(trace);
+	if (retval) {
+		err_msg("Failed to enable osnoise tracer\n");
+		goto out_top;
+	}
+
+	if (params->set_sched) {
+		retval = set_comm_sched_attr("osnoise/", &params->sched_param);
+		if (retval) {
+			err_msg("Failed to set sched parameters\n");
+			goto out_top;
+		}
+	}
+
+	trace_instance_start(trace);
+
+	if (params->trace_output) {
+		record = osnoise_init_trace_tool("osnoise");
+		if (!record) {
+			err_msg("Failed to enable the trace instance\n");
+			goto out_top;
+		}
+		trace_instance_start(&record->trace);
+	}
+
+	tool->start_time = time(NULL);
+	osnoise_top_set_signals(params);
+
+	do {
+		sleep(params->sleep_time);
+
+		retval = tracefs_iterate_raw_events(trace->tep,
+						    trace->inst,
+						    NULL,
+						    0,
+						    collect_registered_events,
+						    trace);
+		if (retval < 0) {
+			err_msg("Error iterating on events\n");
+			goto out_top;
+		}
+
+		if (!params->quiet)
+			osnoise_print_stats(params, tool);
+
+		if (!tracefs_trace_is_on(trace->inst))
+			break;
+
+	} while (!stop_tracing);
+
+	osnoise_print_stats(params, tool);
+
+	return_value = 0;
+
+	if (!tracefs_trace_is_on(trace->inst)) {
+		printf("osnoise hit stop tracing\n");
+		if (params->trace_output) {
+			printf("  Saving trace to %s\n", params->trace_output);
+			save_trace_to_file(record->trace.inst, params->trace_output);
+		}
+	}
+
+out_top:
+	osnoise_free_top(tool->data);
+	osnoise_destroy_tool(tool);
+	if (params->trace_output)
+		osnoise_destroy_tool(record);
+out_exit:
+	exit(return_value);
+}
-- 
cgit v1.2.3


From 829a6c0b5698c7d51aa2bcd16963fcd9de295476 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:24 +0100
Subject: rtla/osnoise: Add the hist mode

The rtla osnoise hist tool collects all osnoise:sample_threshold
occurrence in a histogram, displaying the results in a user-friendly
way. The tool also allows many configurations of the osnoise tracer
and the collection of the tracer output.

Here is one example of the rtla osnoise hist tool output:
  ---------- %< ----------
 [root@f34 ~]# rtla osnoise hist --bucket-size 10 --entries 100 -c 0-8 -d 1M -r 9000 -P F:1
 # RTLA osnoise histogram
 # Time unit is microseconds (us)
 # Duration:   0 00:01:00
 Index   CPU-000   CPU-001   CPU-002   CPU-003   CPU-004   CPU-005   CPU-006   CPU-007   CPU-008
 0           430       434       352       455       440       463       467       436       484
 10           88        88        92       141       120       100       126       166       100
 20           19         7        12        22         8         8        13        13        16
 30            6         0         2         0         1         2         2         1         0
 50            0         0         0         0         0         0         1         0         0
 over:         0         0         0         0         0         0         0         0         0
 count:      543       529       458       618       569       573       609       616       600
 min:          0         0         0         0         0         0         0         0         0
 avg:          0         0         0         0         0         0         0         0         0
 max:         30        20        30        20        30        30        50        30        20
  ---------- >% ----------

Running
 - rtla osnoise hist --help

provides information about the available options.

Link: https://lkml.kernel.org/r/c68060544de89b8b62510ed91c7369f162eb465b.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/osnoise.c      |   6 +-
 tools/tracing/rtla/src/osnoise.h      |   1 +
 tools/tracing/rtla/src/osnoise_hist.c | 801 ++++++++++++++++++++++++++++++++++
 3 files changed, 807 insertions(+), 1 deletion(-)
 create mode 100644 tools/tracing/rtla/src/osnoise_hist.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index fbf4f9bfda02..7b73d1eccd0e 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -828,7 +828,8 @@ static void osnoise_usage(void)
 		"  usage: [rtla] osnoise [MODE] ...",
 		"",
 		"  modes:",
-		"     top  - prints the summary from osnoise tracer",
+		"     top   - prints the summary from osnoise tracer",
+		"     hist  - prints a histogram of osnoise samples",
 		"",
 		"if no MODE is given, the top mode is called, passing the arguments",
 		NULL,
@@ -863,6 +864,9 @@ int osnoise_main(int argc, char *argv[])
 	} else if (strcmp(argv[1], "top") == 0) {
 		osnoise_top_main(argc-1, &argv[1]);
 		exit(0);
+	} else if (strcmp(argv[1], "hist") == 0) {
+		osnoise_hist_main(argc-1, &argv[1]);
+		exit(0);
 	}
 
 usage:
diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
index a4b114ed7d71..9e4b2e2a4559 100644
--- a/tools/tracing/rtla/src/osnoise.h
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -86,5 +86,6 @@ void osnoise_destroy_tool(struct osnoise_tool *top);
 struct osnoise_tool *osnoise_init_tool(char *tool_name);
 struct osnoise_tool *osnoise_init_trace_tool(char *tracer);
 
+int osnoise_hist_main(int argc, char *argv[]);
 int osnoise_top_main(int argc, char **argv);
 int osnoise_main(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
new file mode 100644
index 000000000000..180fcbe423cd
--- /dev/null
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -0,0 +1,801 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "utils.h"
+#include "osnoise.h"
+
+struct osnoise_hist_params {
+	char			*cpus;
+	char			*monitored_cpus;
+	char			*trace_output;
+	unsigned long long	runtime;
+	unsigned long long	period;
+	long long		stop_us;
+	long long		stop_total_us;
+	int			sleep_time;
+	int			duration;
+	int			set_sched;
+	int			output_divisor;
+	struct sched_attr	sched_param;
+
+	char			no_header;
+	char			no_summary;
+	char			no_index;
+	char			with_zeros;
+	int			bucket_size;
+	int			entries;
+};
+
+struct osnoise_hist_cpu {
+	int			*samples;
+	int			count;
+
+	unsigned long long	min_sample;
+	unsigned long long	sum_sample;
+	unsigned long long	max_sample;
+
+};
+
+struct osnoise_hist_data {
+	struct tracefs_hist	*trace_hist;
+	struct osnoise_hist_cpu	*hist;
+	int			entries;
+	int			bucket_size;
+	int			nr_cpus;
+};
+
+/*
+ * osnoise_free_histogram - free runtime data
+ */
+static void
+osnoise_free_histogram(struct osnoise_hist_data *data)
+{
+	int cpu;
+
+	/* one histogram for IRQ and one for thread, per CPU */
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (data->hist[cpu].samples)
+			free(data->hist[cpu].samples);
+	}
+
+	/* one set of histograms per CPU */
+	if (data->hist)
+		free(data->hist);
+
+	free(data);
+}
+
+/*
+ * osnoise_alloc_histogram - alloc runtime data
+ */
+static struct osnoise_hist_data
+*osnoise_alloc_histogram(int nr_cpus, int entries, int bucket_size)
+{
+	struct osnoise_hist_data *data;
+	int cpu;
+
+	data = calloc(1, sizeof(*data));
+	if (!data)
+		return NULL;
+
+	data->entries = entries;
+	data->bucket_size = bucket_size;
+	data->nr_cpus = nr_cpus;
+
+	data->hist = calloc(1, sizeof(*data->hist) * nr_cpus);
+	if (!data->hist)
+		goto cleanup;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		data->hist[cpu].samples = calloc(1, sizeof(*data->hist->samples) * (entries + 1));
+		if (!data->hist[cpu].samples)
+			goto cleanup;
+	}
+
+	/* set the min to max */
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		data->hist[cpu].min_sample = ~0;
+
+	return data;
+
+cleanup:
+	osnoise_free_histogram(data);
+	return NULL;
+}
+
+static void osnoise_hist_update_multiple(struct osnoise_tool *tool, int cpu,
+					 unsigned long long duration, int count)
+{
+	struct osnoise_hist_params *params = tool->params;
+	struct osnoise_hist_data *data = tool->data;
+	int entries = data->entries;
+	int bucket;
+	int *hist;
+
+	if (params->output_divisor)
+		duration = duration / params->output_divisor;
+
+	if (data->bucket_size)
+		bucket = duration / data->bucket_size;
+
+	hist = data->hist[cpu].samples;
+	data->hist[cpu].count += count;
+	update_min(&data->hist[cpu].min_sample, &duration);
+	update_sum(&data->hist[cpu].sum_sample, &duration);
+	update_max(&data->hist[cpu].max_sample, &duration);
+
+	if (bucket < entries)
+		hist[bucket] += count;
+	else
+		hist[entries] += count;
+}
+
+/*
+ * osnoise_destroy_trace_hist - disable events used to collect histogram
+ */
+static void osnoise_destroy_trace_hist(struct osnoise_tool *tool)
+{
+	struct osnoise_hist_data *data = tool->data;
+
+	tracefs_hist_pause(tool->trace.inst, data->trace_hist);
+	tracefs_hist_destroy(tool->trace.inst, data->trace_hist);
+}
+
+/*
+ * osnoise_init_trace_hist - enable events used to collect histogram
+ */
+static int osnoise_init_trace_hist(struct osnoise_tool *tool)
+{
+	struct osnoise_hist_params *params = tool->params;
+	struct osnoise_hist_data *data = tool->data;
+	int bucket_size;
+	char buff[128];
+	int retval = 0;
+
+	/*
+	 * Set the size of the bucket.
+	 */
+	bucket_size = params->output_divisor * params->bucket_size;
+	snprintf(buff, sizeof(buff), "duration.buckets=%d", bucket_size);
+
+	data->trace_hist = tracefs_hist_alloc(tool->trace.tep, "osnoise", "sample_threshold",
+			buff, TRACEFS_HIST_KEY_NORMAL);
+	if (!data->trace_hist)
+		return 1;
+
+	retval = tracefs_hist_add_key(data->trace_hist, "cpu", 0);
+	if (retval)
+		goto out_err;
+
+	retval = tracefs_hist_start(tool->trace.inst, data->trace_hist);
+	if (retval)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	osnoise_destroy_trace_hist(tool);
+	return 1;
+}
+
+/*
+ * osnoise_read_trace_hist - parse histogram file and file osnoise histogram
+ */
+static void osnoise_read_trace_hist(struct osnoise_tool *tool)
+{
+	struct osnoise_hist_data *data = tool->data;
+	long long cpu, counter, duration;
+	char *content, *position;
+
+	tracefs_hist_pause(tool->trace.inst, data->trace_hist);
+
+	content = tracefs_event_file_read(tool->trace.inst, "osnoise",
+					  "sample_threshold",
+					  "hist", NULL);
+	if (!content)
+		return;
+
+	position = content;
+	while (true) {
+		position = strstr(position, "duration: ~");
+		if (!position)
+			break;
+		position += strlen("duration: ~");
+		duration = get_llong_from_str(position);
+		if (duration == -1)
+			err_msg("error reading duration from histogram\n");
+
+		position = strstr(position, "cpu:");
+		if (!position)
+			break;
+		position += strlen("cpu: ");
+		cpu = get_llong_from_str(position);
+		if (cpu == -1)
+			err_msg("error reading cpu from histogram\n");
+
+		position = strstr(position, "hitcount:");
+		if (!position)
+			break;
+		position += strlen("hitcount: ");
+		counter = get_llong_from_str(position);
+		if (counter == -1)
+			err_msg("error reading counter from histogram\n");
+
+		osnoise_hist_update_multiple(tool, cpu, duration, counter);
+	}
+	free(content);
+}
+
+/*
+ * osnoise_hist_header - print the header of the tracer to the output
+ */
+static void osnoise_hist_header(struct osnoise_tool *tool)
+{
+	struct osnoise_hist_params *params = tool->params;
+	struct osnoise_hist_data *data = tool->data;
+	struct trace_seq *s = tool->trace.seq;
+	char duration[26];
+	int cpu;
+
+	if (params->no_header)
+		return;
+
+	get_duration(tool->start_time, duration, sizeof(duration));
+	trace_seq_printf(s, "# RTLA osnoise histogram\n");
+	trace_seq_printf(s, "# Time unit is %s (%s)\n",
+			params->output_divisor == 1 ? "nanoseconds" : "microseconds",
+			params->output_divisor == 1 ? "ns" : "us");
+
+	trace_seq_printf(s, "# Duration: %s\n", duration);
+
+	if (!params->no_index)
+		trace_seq_printf(s, "Index");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		trace_seq_printf(s, "   CPU-%03d", cpu);
+	}
+	trace_seq_printf(s, "\n");
+
+	trace_seq_do_printf(s);
+	trace_seq_reset(s);
+}
+
+/*
+ * osnoise_print_summary - print the summary of the hist data to the output
+ */
+static void
+osnoise_print_summary(struct osnoise_hist_params *params,
+		       struct trace_instance *trace,
+		       struct osnoise_hist_data *data)
+{
+	int cpu;
+
+	if (params->no_summary)
+		return;
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "count:");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		trace_seq_printf(trace->seq, "%9d ", data->hist[cpu].count);
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "min:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		trace_seq_printf(trace->seq, "%9llu ",	data->hist[cpu].min_sample);
+
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "avg:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		if (data->hist[cpu].count)
+			trace_seq_printf(trace->seq, "%9llu ",
+					data->hist[cpu].sum_sample / data->hist[cpu].count);
+		else
+			trace_seq_printf(trace->seq, "        - ");
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "max:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		trace_seq_printf(trace->seq, "%9llu ", data->hist[cpu].max_sample);
+
+	}
+	trace_seq_printf(trace->seq, "\n");
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+}
+
+/*
+ * osnoise_print_stats - print data for all CPUs
+ */
+static void
+osnoise_print_stats(struct osnoise_hist_params *params, struct osnoise_tool *tool)
+{
+	struct osnoise_hist_data *data = tool->data;
+	struct trace_instance *trace = &tool->trace;
+	int bucket, cpu;
+	int total;
+
+	osnoise_hist_header(tool);
+
+	for (bucket = 0; bucket < data->entries; bucket++) {
+		total = 0;
+
+		if (!params->no_index)
+			trace_seq_printf(trace->seq, "%-6d",
+					 bucket * data->bucket_size);
+
+		for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+			if (params->cpus && !params->monitored_cpus[cpu])
+				continue;
+
+			if (!data->hist[cpu].count)
+				continue;
+
+			total += data->hist[cpu].samples[bucket];
+			trace_seq_printf(trace->seq, "%9d ", data->hist[cpu].samples[bucket]);
+		}
+
+		if (total == 0 && !params->with_zeros) {
+			trace_seq_reset(trace->seq);
+			continue;
+		}
+
+		trace_seq_printf(trace->seq, "\n");
+		trace_seq_do_printf(trace->seq);
+		trace_seq_reset(trace->seq);
+	}
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "over: ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].count)
+			continue;
+
+		trace_seq_printf(trace->seq, "%9d ",
+				 data->hist[cpu].samples[data->entries]);
+	}
+	trace_seq_printf(trace->seq, "\n");
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+
+	osnoise_print_summary(params, trace, data);
+}
+
+/*
+ * osnoise_hist_usage - prints osnoise hist usage message
+ */
+static void osnoise_hist_usage(char *usage)
+{
+	int i;
+
+	static const char * const msg[] = {
+		"",
+		"  usage: rtla osnoise hist [-h] [-D] [-d s] [-p us] [-r us] [-s us] [-S us] [-t[=file]] \\",
+		"	  [-c cpu-list] [-P priority] [-b N] [-e N] [--no-header] [--no-summary] \\",
+		"	  [--no-index] [--with-zeros]",
+		"",
+		"	  -h/--help: print this menu",
+		"	  -p/--period us: osnoise period in us",
+		"	  -r/--runtime us: osnoise runtime in us",
+		"	  -s/--stop us: stop trace if a single sample is higher than the argument in us",
+		"	  -S/--stop-total us: stop trace if the total sample is higher than the argument in us",
+		"	  -c/--cpus cpu-list: list of cpus to run osnoise threads",
+		"	  -d/--duration time[s|m|h|d]: duration of the session",
+		"	  -D/--debug: print debug info",
+		"	  -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]",
+		"	  -b/--bucket-size N: set the histogram bucket size (default 1)",
+		"	  -e/--entries N: set the number of entries of the histogram (default 256)",
+		"	     --no-header: do not print header",
+		"	     --no-summary: do not print summary",
+		"	     --no-index: do not print index",
+		"	     --with-zeros: print zero only entries",
+		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period: set scheduling parameters",
+		"		o:prio - use SCHED_OTHER with prio",
+		"		r:prio - use SCHED_RR with prio",
+		"		f:prio - use SCHED_FIFO with prio",
+		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
+		"						       in nanoseconds",
+		NULL,
+	};
+
+	if (usage)
+		fprintf(stderr, "%s\n", usage);
+
+	fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n",
+			VERSION);
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+/*
+ * osnoise_hist_parse_args - allocs, parse and fill the cmd line parameters
+ */
+static struct osnoise_hist_params
+*osnoise_hist_parse_args(int argc, char *argv[])
+{
+	struct osnoise_hist_params *params;
+	int retval;
+	int c;
+
+	params = calloc(1, sizeof(*params));
+	if (!params)
+		exit(1);
+
+	/* display data in microseconds */
+	params->output_divisor = 1000;
+	params->bucket_size = 1;
+	params->entries = 256;
+
+	while (1) {
+		static struct option long_options[] = {
+			{"bucket-size",		required_argument,	0, 'b'},
+			{"entries",		required_argument,	0, 'e'},
+			{"cpus",		required_argument,	0, 'c'},
+			{"debug",		no_argument,		0, 'D'},
+			{"duration",		required_argument,	0, 'd'},
+			{"help",		no_argument,		0, 'h'},
+			{"period",		required_argument,	0, 'p'},
+			{"priority",		required_argument,	0, 'P'},
+			{"runtime",		required_argument,	0, 'r'},
+			{"stop",		required_argument,	0, 's'},
+			{"stop-total",		required_argument,	0, 'S'},
+			{"trace",		optional_argument,	0, 't'},
+			{"no-header",		no_argument,		0, '0'},
+			{"no-summary",		no_argument,		0, '1'},
+			{"no-index",		no_argument,		0, '2'},
+			{"with-zeros",		no_argument,		0, '3'},
+			{0, 0, 0, 0}
+		};
+
+		/* getopt_long stores the option index here. */
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "c:b:d:e:Dhp:P:r:s:S:t::0123",
+				 long_options, &option_index);
+
+		/* detect the end of the options. */
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'b':
+			params->bucket_size = get_llong_from_str(optarg);
+			if ((params->bucket_size == 0) || (params->bucket_size >= 1000000))
+				osnoise_hist_usage("Bucket size needs to be > 0 and <= 1000000\n");
+			break;
+		case 'c':
+			retval = parse_cpu_list(optarg, &params->monitored_cpus);
+			if (retval)
+				osnoise_hist_usage("\nInvalid -c cpu list\n");
+			params->cpus = optarg;
+			break;
+		case 'D':
+			config_debug = 1;
+			break;
+		case 'd':
+			params->duration = parse_seconds_duration(optarg);
+			if (!params->duration)
+				osnoise_hist_usage("Invalid -D duration\n");
+			break;
+		case 'e':
+			params->entries = get_llong_from_str(optarg);
+			if ((params->entries < 10) || (params->entries > 9999999))
+				osnoise_hist_usage("Entries must be > 10 and < 9999999\n");
+			break;
+		case 'h':
+		case '?':
+			osnoise_hist_usage(NULL);
+			break;
+		case 'p':
+			params->period = get_llong_from_str(optarg);
+			if (params->period > 10000000)
+				osnoise_hist_usage("Period longer than 10 s\n");
+			break;
+		case 'P':
+			retval = parse_prio(optarg, &params->sched_param);
+			if (retval == -1)
+				osnoise_hist_usage("Invalid -P priority");
+			params->set_sched = 1;
+			break;
+		case 'r':
+			params->runtime = get_llong_from_str(optarg);
+			if (params->runtime < 100)
+				osnoise_hist_usage("Runtime shorter than 100 us\n");
+			break;
+		case 's':
+			params->stop_us = get_llong_from_str(optarg);
+			break;
+		case 'S':
+			params->stop_total_us = get_llong_from_str(optarg);
+			break;
+		case 't':
+			if (optarg)
+				/* skip = */
+				params->trace_output = &optarg[1];
+			else
+				params->trace_output = "osnoise_trace.txt";
+			break;
+		case '0': /* no header */
+			params->no_header = 1;
+			break;
+		case '1': /* no summary */
+			params->no_summary = 1;
+			break;
+		case '2': /* no index */
+			params->no_index = 1;
+			break;
+		case '3': /* with zeros */
+			params->with_zeros = 1;
+			break;
+		default:
+			osnoise_hist_usage("Invalid option");
+		}
+	}
+
+	if (geteuid()) {
+		err_msg("rtla needs root permission\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (params->no_index && !params->with_zeros)
+		osnoise_hist_usage("no-index set and with-zeros not set - it does not make sense");
+
+	return params;
+}
+
+/*
+ * osnoise_hist_apply_config - apply the hist configs to the initialized tool
+ */
+static int
+osnoise_hist_apply_config(struct osnoise_tool *tool, struct osnoise_hist_params *params)
+{
+	int retval;
+
+	if (!params->sleep_time)
+		params->sleep_time = 1;
+
+	if (params->cpus) {
+		retval = osnoise_set_cpus(tool->context, params->cpus);
+		if (retval) {
+			err_msg("Failed to apply CPUs config\n");
+			goto out_err;
+		}
+	}
+
+	if (params->runtime || params->period) {
+		retval = osnoise_set_runtime_period(tool->context,
+						    params->runtime,
+						    params->period);
+		if (retval) {
+			err_msg("Failed to set runtime and/or period\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_us) {
+		retval = osnoise_set_stop_us(tool->context, params->stop_us);
+		if (retval) {
+			err_msg("Failed to set stop us\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_total_us) {
+		retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us);
+		if (retval) {
+			err_msg("Failed to set stop total us\n");
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	return -1;
+}
+
+/*
+ * osnoise_init_hist - initialize a osnoise hist tool with parameters
+ */
+static struct osnoise_tool
+*osnoise_init_hist(struct osnoise_hist_params *params)
+{
+	struct osnoise_tool *tool;
+	int nr_cpus;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	tool = osnoise_init_tool("osnoise_hist");
+	if (!tool)
+		return NULL;
+
+	tool->data = osnoise_alloc_histogram(nr_cpus, params->entries, params->bucket_size);
+	if (!tool->data)
+		goto out_err;
+
+	tool->params = params;
+
+	return tool;
+
+out_err:
+	osnoise_destroy_tool(tool);
+	return NULL;
+}
+
+static int stop_tracing;
+static void stop_hist(int sig)
+{
+	stop_tracing = 1;
+}
+
+/*
+ * osnoise_hist_set_signals - handles the signal to stop the tool
+ */
+static void
+osnoise_hist_set_signals(struct osnoise_hist_params *params)
+{
+	signal(SIGINT, stop_hist);
+	if (params->duration) {
+		signal(SIGALRM, stop_hist);
+		alarm(params->duration);
+	}
+}
+
+int osnoise_hist_main(int argc, char *argv[])
+{
+	struct osnoise_hist_params *params;
+	struct trace_instance *trace;
+	struct osnoise_tool *record;
+	struct osnoise_tool *tool;
+	int return_value = 1;
+	int retval;
+
+	params = osnoise_hist_parse_args(argc, argv);
+	if (!params)
+		exit(1);
+
+	tool = osnoise_init_hist(params);
+	if (!tool) {
+		err_msg("Could not init osnoise hist\n");
+		goto out_exit;
+	}
+
+	retval = osnoise_hist_apply_config(tool, params);
+	if (retval) {
+		err_msg("Could not apply config\n");
+		goto out_destroy;
+	}
+
+	trace = &tool->trace;
+
+	retval = enable_osnoise(trace);
+	if (retval) {
+		err_msg("Failed to enable osnoise tracer\n");
+		goto out_destroy;
+	}
+
+	retval = osnoise_init_trace_hist(tool);
+	if (retval)
+		goto out_destroy;
+
+	if (params->set_sched) {
+		retval = set_comm_sched_attr("osnoise/", &params->sched_param);
+		if (retval) {
+			err_msg("Failed to set sched parameters\n");
+			goto out_hist;
+		}
+	}
+
+	trace_instance_start(trace);
+
+	if (params->trace_output) {
+		record = osnoise_init_trace_tool("osnoise");
+		if (!record) {
+			err_msg("Failed to enable the trace instance\n");
+			goto out_hist;
+		}
+		trace_instance_start(&record->trace);
+	}
+
+	tool->start_time = time(NULL);
+	osnoise_hist_set_signals(params);
+
+	while (!stop_tracing) {
+		sleep(params->sleep_time);
+
+		retval = tracefs_iterate_raw_events(trace->tep,
+						    trace->inst,
+						    NULL,
+						    0,
+						    collect_registered_events,
+						    trace);
+		if (retval < 0) {
+			err_msg("Error iterating on events\n");
+			goto out_hist;
+		}
+
+		if (!tracefs_trace_is_on(trace->inst))
+			break;
+	};
+
+	osnoise_read_trace_hist(tool);
+
+	osnoise_print_stats(params, tool);
+
+	return_value = 0;
+
+	if (!tracefs_trace_is_on(trace->inst)) {
+		printf("rtla timelat hit stop tracing\n");
+		if (params->trace_output) {
+			printf("  Saving trace to %s\n", params->trace_output);
+			save_trace_to_file(record->trace.inst, params->trace_output);
+		}
+	}
+
+out_hist:
+	osnoise_free_histogram(tool->data);
+out_destroy:
+	osnoise_destroy_tool(tool);
+	if (params->trace_output)
+		osnoise_destroy_tool(record);
+	free(params);
+out_exit:
+	exit(return_value);
+}
-- 
cgit v1.2.3


From a828cd18bc4ad72ff6d64c0dd23dc814a6741fed Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:25 +0100
Subject: rtla: Add timerlat tool and timelart top mode

The rtla timerlat tool is an interface for the timerlat tracer.
The timerlat tracer dispatches a kernel thread per-cpu. These threads set a
periodic timer to wake themselves up and go back to sleep. After the
wakeup, they collect and generate useful information for the debugging of
operating system timer latency.

The timerlat tracer outputs information in two ways. It periodically
prints the timer latency at the timer IRQ handler and the Thread handler.
It also provides information for each noise via the osnoise tracepoints.

The rtla timerlat top mode displays a summary of the periodic output from
the timerlat tracer.

Here is one example of the rtla timerlat tool output:
 ---------- %< ----------
[root@alien ~]# rtla timerlat top -c 0-3 -d 1m
                                     Timer Latency
  0 00:01:00   |          IRQ Timer Latency (us)        |         Thread Timer Latency (us)
CPU COUNT      |      cur       min       avg       max |      cur       min       avg       max
  0 #60001     |        0         0         0         3 |        1         1         1         6
  1 #60001     |        0         0         0         3 |        2         1         1         5
  2 #60001     |        0         0         1         6 |        1         1         2         7
  3 #60001     |        0         0         0         7 |        1         1         1        11
 ---------- >% ----------

Running:
  # rtla timerlat --help
  # rtla timerlat top --help
provides information about the available options.

Link: https://lkml.kernel.org/r/e95032e20c2b88c962195bf7693bb53c9ebcced8.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/Makefile           |   2 +
 tools/tracing/rtla/src/rtla.c         |   5 +
 tools/tracing/rtla/src/timerlat.c     |  68 ++++
 tools/tracing/rtla/src/timerlat.h     |   4 +
 tools/tracing/rtla/src/timerlat_top.c | 618 ++++++++++++++++++++++++++++++++++
 5 files changed, 697 insertions(+)
 create mode 100644 tools/tracing/rtla/src/timerlat.c
 create mode 100644 tools/tracing/rtla/src/timerlat.h
 create mode 100644 tools/tracing/rtla/src/timerlat_top.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index d40b442fcd30..7a7348faec32 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -62,6 +62,8 @@ install:
 	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
 	@test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise
 	ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/osnoise
+	@test ! -f $(DESTDIR)$(BINDIR)/timerlat || rm $(DESTDIR)$(BINDIR)/timerlat
+	ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/timerlat
 
 .PHONY: clean tarball
 clean:
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
index 669b9750b3b3..09bd21b8af81 100644
--- a/tools/tracing/rtla/src/rtla.c
+++ b/tools/tracing/rtla/src/rtla.c
@@ -9,6 +9,7 @@
 #include <stdio.h>
 
 #include "osnoise.h"
+#include "timerlat.h"
 
 /*
  * rtla_usage - print rtla usage
@@ -25,6 +26,7 @@ static void rtla_usage(void)
 		"",
 		"  commands:",
 		"     osnoise  - gives information about the operating system noise (osnoise)",
+		"     timerlat - measures the timer irq and thread latency",
 		"",
 		NULL,
 	};
@@ -45,6 +47,9 @@ int run_command(int argc, char **argv, int start_position)
 	if (strcmp(argv[start_position], "osnoise") == 0) {
 		osnoise_main(argc-start_position, &argv[start_position]);
 		goto ran;
+	} else if (strcmp(argv[start_position], "timerlat") == 0) {
+		timerlat_main(argc-start_position, &argv[start_position]);
+		goto ran;
 	}
 
 	return 0;
diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c
new file mode 100644
index 000000000000..29a51c01b084
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+#include "timerlat.h"
+
+static void timerlat_usage(void)
+{
+	int i;
+
+	static const char * const msg[] = {
+		"",
+		"timerlat version " VERSION,
+		"",
+		"  usage: [rtla] timerlat [MODE] ...",
+		"",
+		"  modes:",
+		"     top  - prints the summary from timerlat tracer",
+		"",
+		"if no MODE is given, the top mode is called, passing the arguments",
+		NULL,
+	};
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+int timerlat_main(int argc, char *argv[])
+{
+	if (argc == 0)
+		goto usage;
+
+	/*
+	 * if timerlat was called without any argument, run the
+	 * default cmdline.
+	 */
+	if (argc == 1) {
+		timerlat_top_main(argc, argv);
+		exit(0);
+	}
+
+	if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) {
+		timerlat_usage();
+		exit(0);
+	} else if (strncmp(argv[1], "-", 1) == 0) {
+		/* the user skipped the tool, call the default one */
+		timerlat_top_main(argc, argv);
+		exit(0);
+	} else if (strcmp(argv[1], "top") == 0) {
+		timerlat_top_main(argc-1, &argv[1]);
+		exit(0);
+	}
+
+usage:
+	timerlat_usage();
+	exit(1);
+}
diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h
new file mode 100644
index 000000000000..6f9d3925acd0
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat.h
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+
+int timerlat_top_main(int argc, char *argv[]);
+int timerlat_main(int argc, char *argv[]);
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
new file mode 100644
index 000000000000..1ebd5291539c
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -0,0 +1,618 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "utils.h"
+#include "osnoise.h"
+#include "timerlat.h"
+
+struct timerlat_top_params {
+	char			*cpus;
+	char			*monitored_cpus;
+	char			*trace_output;
+	unsigned long long	runtime;
+	long long		stop_us;
+	long long		stop_total_us;
+	long long		timerlat_period_us;
+	long long		print_stack;
+	int			sleep_time;
+	int			output_divisor;
+	int			duration;
+	int			quiet;
+	int			set_sched;
+	struct sched_attr	sched_param;
+};
+
+struct timerlat_top_cpu {
+	int			irq_count;
+	int			thread_count;
+
+	unsigned long long	cur_irq;
+	unsigned long long	min_irq;
+	unsigned long long	sum_irq;
+	unsigned long long	max_irq;
+
+	unsigned long long	cur_thread;
+	unsigned long long	min_thread;
+	unsigned long long	sum_thread;
+	unsigned long long	max_thread;
+};
+
+struct timerlat_top_data {
+	struct timerlat_top_cpu	*cpu_data;
+	int			nr_cpus;
+};
+
+/*
+ * timerlat_free_top - free runtime data
+ */
+static void
+timerlat_free_top(struct timerlat_top_data *data)
+{
+	free(data->cpu_data);
+	free(data);
+}
+
+/*
+ * timerlat_alloc_histogram - alloc runtime data
+ */
+static struct timerlat_top_data *timerlat_alloc_top(int nr_cpus)
+{
+	struct timerlat_top_data *data;
+	int cpu;
+
+	data = calloc(1, sizeof(*data));
+	if (!data)
+		return NULL;
+
+	data->nr_cpus = nr_cpus;
+
+	/* one set of histograms per CPU */
+	data->cpu_data = calloc(1, sizeof(*data->cpu_data) * nr_cpus);
+	if (!data->cpu_data)
+		goto cleanup;
+
+	/* set the min to max */
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		data->cpu_data[cpu].min_irq = ~0;
+		data->cpu_data[cpu].min_thread = ~0;
+	}
+
+	return data;
+
+cleanup:
+	timerlat_free_top(data);
+	return NULL;
+}
+
+/*
+ * timerlat_hist_update - record a new timerlat occurent on cpu, updating data
+ */
+static void
+timerlat_top_update(struct osnoise_tool *tool, int cpu,
+		    unsigned long long thread,
+		    unsigned long long latency)
+{
+	struct timerlat_top_data *data = tool->data;
+	struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu];
+
+	if (!thread) {
+		cpu_data->irq_count++;
+		cpu_data->cur_irq = latency;
+		update_min(&cpu_data->min_irq, &latency);
+		update_sum(&cpu_data->sum_irq, &latency);
+		update_max(&cpu_data->max_irq, &latency);
+	} else {
+		cpu_data->thread_count++;
+		cpu_data->cur_thread = latency;
+		update_min(&cpu_data->min_thread, &latency);
+		update_sum(&cpu_data->sum_thread, &latency);
+		update_max(&cpu_data->max_thread, &latency);
+	}
+}
+
+/*
+ * timerlat_top_handler - this is the handler for timerlat tracer events
+ */
+static int
+timerlat_top_handler(struct trace_seq *s, struct tep_record *record,
+		     struct tep_event *event, void *context)
+{
+	struct trace_instance *trace = context;
+	unsigned long long latency, thread;
+	struct osnoise_tool *top;
+	int cpu = record->cpu;
+
+	top = container_of(trace, struct osnoise_tool, trace);
+
+	tep_get_field_val(s, event, "context", record, &thread, 1);
+	tep_get_field_val(s, event, "timer_latency", record, &latency, 1);
+
+	timerlat_top_update(top, cpu, thread, latency);
+
+	return 0;
+}
+
+/*
+ * timerlat_top_header - print the header of the tool output
+ */
+static void timerlat_top_header(struct osnoise_tool *top)
+{
+	struct timerlat_top_params *params = top->params;
+	struct trace_seq *s = top->trace.seq;
+	char duration[26];
+
+	get_duration(top->start_time, duration, sizeof(duration));
+
+	trace_seq_printf(s, "\033[2;37;40m");
+	trace_seq_printf(s, "                                     Timer Latency                                              ");
+	trace_seq_printf(s, "\033[0;0;0m");
+	trace_seq_printf(s, "\n");
+
+	trace_seq_printf(s, "%-6s   |          IRQ Timer Latency (%s)        |         Thread Timer Latency (%s)\n", duration,
+			params->output_divisor == 1 ? "ns" : "us",
+			params->output_divisor == 1 ? "ns" : "us");
+
+	trace_seq_printf(s, "\033[2;30;47m");
+	trace_seq_printf(s, "CPU COUNT      |      cur       min       avg       max |      cur       min       avg       max");
+	trace_seq_printf(s, "\033[0;0;0m");
+	trace_seq_printf(s, "\n");
+}
+
+/*
+ * timerlat_top_print - prints the output of a given CPU
+ */
+static void timerlat_top_print(struct osnoise_tool *top, int cpu)
+{
+
+	struct timerlat_top_params *params = top->params;
+	struct timerlat_top_data *data = top->data;
+	struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu];
+	int divisor = params->output_divisor;
+	struct trace_seq *s = top->trace.seq;
+
+	if (divisor == 0)
+		return;
+
+	/*
+	 * Skip if no data is available: is this cpu offline?
+	 */
+	if (!cpu_data->irq_count && !cpu_data->thread_count)
+		return;
+
+	/*
+	 * Unless trace is being lost, IRQ counter is always the max.
+	 */
+	trace_seq_printf(s, "%3d #%-9d |", cpu, cpu_data->irq_count);
+
+	if (!cpu_data->irq_count) {
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        - |");
+	} else {
+		trace_seq_printf(s, "%9llu ", cpu_data->cur_irq / params->output_divisor);
+		trace_seq_printf(s, "%9llu ", cpu_data->min_irq / params->output_divisor);
+		trace_seq_printf(s, "%9llu ", (cpu_data->sum_irq / cpu_data->irq_count) / divisor);
+		trace_seq_printf(s, "%9llu |", cpu_data->max_irq / divisor);
+	}
+
+	if (!cpu_data->thread_count) {
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        - ");
+		trace_seq_printf(s, "        -\n");
+	} else {
+		trace_seq_printf(s, "%9llu ", cpu_data->cur_thread / divisor);
+		trace_seq_printf(s, "%9llu ", cpu_data->min_thread / divisor);
+		trace_seq_printf(s, "%9llu ",
+				(cpu_data->sum_thread / cpu_data->thread_count) / divisor);
+		trace_seq_printf(s, "%9llu\n", cpu_data->max_thread / divisor);
+	}
+}
+
+/*
+ * clear_terminal - clears the output terminal
+ */
+static void clear_terminal(struct trace_seq *seq)
+{
+	if (!config_debug)
+		trace_seq_printf(seq, "\033c");
+}
+
+/*
+ * timerlat_print_stats - print data for all cpus
+ */
+static void
+timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top)
+{
+	struct trace_instance *trace = &top->trace;
+	static int nr_cpus = -1;
+	int i;
+
+	if (nr_cpus == -1)
+		nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	if (!params->quiet)
+		clear_terminal(trace->seq);
+
+	timerlat_top_header(top);
+
+	for (i = 0; i < nr_cpus; i++) {
+		if (params->cpus && !params->monitored_cpus[i])
+			continue;
+		timerlat_top_print(top, i);
+	}
+
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+}
+
+/*
+ * timerlat_top_usage - prints timerlat top usage message
+ */
+static void timerlat_top_usage(char *usage)
+{
+	int i;
+
+	static const char *const msg[] = {
+		"",
+		"  usage: rtla timerlat [top] [-h] [-q] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] [-t[=file]] \\",
+		"	  [-c cpu-list] [-P priority]",
+		"",
+		"	  -h/--help: print this menu",
+		"	  -p/--period us: timerlat period in us",
+		"	  -i/--irq us: stop trace if the irq latency is higher than the argument in us",
+		"	  -T/--thread us: stop trace if the thread latency is higher than the argument in us",
+		"	  -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
+		"	  -c/--cpus cpus: run the tracer only on the given cpus",
+		"	  -d/--duration time[m|h|d]: duration of the session in seconds",
+		"	  -D/--debug: print debug info",
+		"	  -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]",
+		"	  -n/--nano: display data in nanoseconds",
+		"	  -q/--quiet print only a summary at the end",
+		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
+		"		o:prio - use SCHED_OTHER with prio",
+		"		r:prio - use SCHED_RR with prio",
+		"		f:prio - use SCHED_FIFO with prio",
+		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
+		"						       in nanoseconds",
+		NULL,
+	};
+
+	if (usage)
+		fprintf(stderr, "%s\n", usage);
+
+	fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n",
+			VERSION);
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+/*
+ * timerlat_top_parse_args - allocs, parse and fill the cmd line parameters
+ */
+static struct timerlat_top_params
+*timerlat_top_parse_args(int argc, char **argv)
+{
+	struct timerlat_top_params *params;
+	int retval;
+	int c;
+
+	params = calloc(1, sizeof(*params));
+	if (!params)
+		exit(1);
+
+	/* display data in microseconds */
+	params->output_divisor = 1000;
+
+	while (1) {
+		static struct option long_options[] = {
+			{"cpus",		required_argument,	0, 'c'},
+			{"debug",		no_argument,		0, 'D'},
+			{"duration",		required_argument,	0, 'd'},
+			{"help",		no_argument,		0, 'h'},
+			{"irq",			required_argument,	0, 'i'},
+			{"nano",		no_argument,		0, 'n'},
+			{"period",		required_argument,	0, 'p'},
+			{"priority",		required_argument,	0, 'P'},
+			{"quiet",		no_argument,		0, 'q'},
+			{"stack",		required_argument,	0, 's'},
+			{"thread",		required_argument,	0, 'T'},
+			{"trace",		optional_argument,	0, 't'},
+			{0, 0, 0, 0}
+		};
+
+		/* getopt_long stores the option index here. */
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "c:d:Dhi:np:P:qs:t::T:",
+				 long_options, &option_index);
+
+		/* detect the end of the options. */
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			retval = parse_cpu_list(optarg, &params->monitored_cpus);
+			if (retval)
+				timerlat_top_usage("\nInvalid -c cpu list\n");
+			params->cpus = optarg;
+			break;
+		case 'D':
+			config_debug = 1;
+			break;
+		case 'd':
+			params->duration = parse_seconds_duration(optarg);
+			if (!params->duration)
+				timerlat_top_usage("Invalid -D duration\n");
+			break;
+		case 'h':
+		case '?':
+			timerlat_top_usage(NULL);
+			break;
+		case 'i':
+			params->stop_us = get_llong_from_str(optarg);
+			break;
+		case 'n':
+			params->output_divisor = 1;
+			break;
+		case 'p':
+			params->timerlat_period_us = get_llong_from_str(optarg);
+			if (params->timerlat_period_us > 1000000)
+				timerlat_top_usage("Period longer than 1 s\n");
+			break;
+		case 'P':
+			retval = parse_prio(optarg, &params->sched_param);
+			if (retval == -1)
+				timerlat_top_usage("Invalid -P priority");
+			params->set_sched = 1;
+			break;
+		case 'q':
+			params->quiet = 1;
+			break;
+		case 's':
+			params->print_stack = get_llong_from_str(optarg);
+			break;
+		case 'T':
+			params->stop_total_us = get_llong_from_str(optarg);
+			break;
+		case 't':
+			if (optarg)
+				/* skip = */
+				params->trace_output = &optarg[1];
+			else
+				params->trace_output = "timerlat_trace.txt";
+			break;
+		default:
+			timerlat_top_usage("Invalid option");
+		}
+	}
+
+	if (geteuid()) {
+		err_msg("rtla needs root permission\n");
+		exit(EXIT_FAILURE);
+	}
+
+	return params;
+}
+
+/*
+ * timerlat_top_apply_config - apply the top configs to the initialized tool
+ */
+static int
+timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params *params)
+{
+	int retval;
+
+	if (!params->sleep_time)
+		params->sleep_time = 1;
+
+	if (params->cpus) {
+		retval = osnoise_set_cpus(top->context, params->cpus);
+		if (retval) {
+			err_msg("Failed to apply CPUs config\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_us) {
+		retval = osnoise_set_stop_us(top->context, params->stop_us);
+		if (retval) {
+			err_msg("Failed to set stop us\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_total_us) {
+		retval = osnoise_set_stop_total_us(top->context, params->stop_total_us);
+		if (retval) {
+			err_msg("Failed to set stop total us\n");
+			goto out_err;
+		}
+	}
+
+
+	if (params->timerlat_period_us) {
+		retval = osnoise_set_timerlat_period_us(top->context, params->timerlat_period_us);
+		if (retval) {
+			err_msg("Failed to set timerlat period\n");
+			goto out_err;
+		}
+	}
+
+
+	if (params->print_stack) {
+		retval = osnoise_set_print_stack(top->context, params->print_stack);
+		if (retval) {
+			err_msg("Failed to set print stack\n");
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	return -1;
+}
+
+/*
+ * timerlat_init_top - initialize a timerlat top tool with parameters
+ */
+static struct osnoise_tool
+*timerlat_init_top(struct timerlat_top_params *params)
+{
+	struct osnoise_tool *top;
+	int nr_cpus;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	top = osnoise_init_tool("timerlat_top");
+	if (!top)
+		return NULL;
+
+	top->data = timerlat_alloc_top(nr_cpus);
+	if (!top->data)
+		goto out_err;
+
+	top->params = params;
+
+	tep_register_event_handler(top->trace.tep, -1, "ftrace", "timerlat",
+				   timerlat_top_handler, top);
+
+	return top;
+
+out_err:
+	osnoise_destroy_tool(top);
+	return NULL;
+}
+
+static int stop_tracing;
+static void stop_top(int sig)
+{
+	stop_tracing = 1;
+}
+
+/*
+ * timerlat_top_set_signals - handles the signal to stop the tool
+ */
+static void
+timerlat_top_set_signals(struct timerlat_top_params *params)
+{
+	signal(SIGINT, stop_top);
+	if (params->duration) {
+		signal(SIGALRM, stop_top);
+		alarm(params->duration);
+	}
+}
+
+int timerlat_top_main(int argc, char *argv[])
+{
+	struct timerlat_top_params *params;
+	struct trace_instance *trace;
+	struct osnoise_tool *record;
+	struct osnoise_tool *top;
+	int return_value = 1;
+	int retval;
+
+	params = timerlat_top_parse_args(argc, argv);
+	if (!params)
+		exit(1);
+
+	top = timerlat_init_top(params);
+	if (!top) {
+		err_msg("Could not init osnoise top\n");
+		goto out_exit;
+	}
+
+	retval = timerlat_top_apply_config(top, params);
+	if (retval) {
+		err_msg("Could not apply config\n");
+		goto out_top;
+	}
+
+	trace = &top->trace;
+
+	retval = enable_timerlat(trace);
+	if (retval) {
+		err_msg("Failed to enable timerlat tracer\n");
+		goto out_top;
+	}
+
+	if (params->set_sched) {
+		retval = set_comm_sched_attr("timerlat/", &params->sched_param);
+		if (retval) {
+			err_msg("Failed to set sched parameters\n");
+			goto out_top;
+		}
+	}
+
+	trace_instance_start(trace);
+
+	if (params->trace_output) {
+		record = osnoise_init_trace_tool("timerlat");
+		if (!record) {
+			err_msg("Failed to enable the trace instance\n");
+			goto out_top;
+		}
+		trace_instance_start(&record->trace);
+	}
+
+	top->start_time = time(NULL);
+	timerlat_top_set_signals(params);
+
+	while (!stop_tracing) {
+		sleep(params->sleep_time);
+
+		retval = tracefs_iterate_raw_events(trace->tep,
+						    trace->inst,
+						    NULL,
+						    0,
+						    collect_registered_events,
+						    trace);
+		if (retval < 0) {
+			err_msg("Error iterating on events\n");
+			goto out_top;
+		}
+
+		if (!params->quiet)
+			timerlat_print_stats(params, top);
+
+		if (!tracefs_trace_is_on(trace->inst))
+			break;
+
+	};
+
+	timerlat_print_stats(params, top);
+
+	return_value = 0;
+
+	if (!tracefs_trace_is_on(trace->inst)) {
+		printf("rtla timelat hit stop tracing\n");
+		if (params->trace_output) {
+			printf("  Saving trace to %s\n", params->trace_output);
+			save_trace_to_file(record->trace.inst, params->trace_output);
+		}
+	}
+
+out_top:
+	timerlat_free_top(top->data);
+	osnoise_destroy_tool(top);
+	if (params->trace_output)
+		osnoise_destroy_tool(record);
+	free(params);
+out_exit:
+	exit(return_value);
+}
-- 
cgit v1.2.3


From 1eeb6328e8b3a917ba0be4965f272147dbd4cf55 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:26 +0100
Subject: rtla/timerlat: Add timerlat hist mode

The rtla hist hist mode displays a histogram of each tracer event
occurrence, both for IRQ and timer latencies. The tool also allows
many configurations of the timerlat tracer and the collection of
the tracer output.

Here is one example of the rtla timerlat hist mode output:
  ---------- %< ----------
 [root@alien ~]# rtla timerlat hist -c 0-3 -d 1M
 # RTLA timerlat histogram
 # Time unit is microseconds (us)
 # Duration:   0 00:01:00
 Index   IRQ-000   Thr-000   IRQ-001   Thr-001   IRQ-002   Thr-002   IRQ-003   Thr-003
 0         58572         0     59373         0     58691         0     58895         0
 1          1422     57021       628     57241      1310     56160      1102     56805
 2             6      2931         0      2695         0      3567         4      3031
 3             1        40         0        53         0       260         0       142
 4             0         7         0         5         0         6         0        17
 5             0         2         0         5         0         7         0         4
 6             0         0         0         2         0         1         0         1
 8             0         0         0         0         0         0         0         1
 over:         0         0         0         0         0         0         0         0
 count:    60001     60001     60001     60001     60001     60001     60001     60001
 min:          0         1         0         1         0         1         0         1
 avg:          0         1         0         1         0         1         0         1
 max:          3         5         1         6         1         6         2         8
  ---------- >% ----------

Running
 - rtla timerlat hist --help
provides information about the available options.

Link: https://lkml.kernel.org/r/7049ed3c46b7d6aceab18ffe7770003dfc4ddceb.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/timerlat.c      |   6 +-
 tools/tracing/rtla/src/timerlat.h      |   2 +-
 tools/tracing/rtla/src/timerlat_hist.c | 822 +++++++++++++++++++++++++++++++++
 3 files changed, 828 insertions(+), 2 deletions(-)
 create mode 100644 tools/tracing/rtla/src/timerlat_hist.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c
index 29a51c01b084..97abbf494fee 100644
--- a/tools/tracing/rtla/src/timerlat.c
+++ b/tools/tracing/rtla/src/timerlat.c
@@ -25,7 +25,8 @@ static void timerlat_usage(void)
 		"  usage: [rtla] timerlat [MODE] ...",
 		"",
 		"  modes:",
-		"     top  - prints the summary from timerlat tracer",
+		"     top   - prints the summary from timerlat tracer",
+		"     hist  - prints a histogram of timer latencies",
 		"",
 		"if no MODE is given, the top mode is called, passing the arguments",
 		NULL,
@@ -60,6 +61,9 @@ int timerlat_main(int argc, char *argv[])
 	} else if (strcmp(argv[1], "top") == 0) {
 		timerlat_top_main(argc-1, &argv[1]);
 		exit(0);
+	} else if (strcmp(argv[1], "hist") == 0) {
+		timerlat_hist_main(argc-1, &argv[1]);
+		exit(0);
 	}
 
 usage:
diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h
index 6f9d3925acd0..88561bfd14f3 100644
--- a/tools/tracing/rtla/src/timerlat.h
+++ b/tools/tracing/rtla/src/timerlat.h
@@ -1,4 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-
+int timerlat_hist_main(int argc, char *argv[]);
 int timerlat_top_main(int argc, char *argv[]);
 int timerlat_main(int argc, char *argv[]);
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
new file mode 100644
index 000000000000..235f9620ef3d
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -0,0 +1,822 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "utils.h"
+#include "osnoise.h"
+#include "timerlat.h"
+
+struct timerlat_hist_params {
+	char			*cpus;
+	char			*monitored_cpus;
+	char			*trace_output;
+	unsigned long long	runtime;
+	long long		stop_us;
+	long long		stop_total_us;
+	long long		timerlat_period_us;
+	long long		print_stack;
+	int			sleep_time;
+	int			output_divisor;
+	int			duration;
+	int			set_sched;
+	struct sched_attr	sched_param;
+
+	char			no_irq;
+	char			no_thread;
+	char			no_header;
+	char			no_summary;
+	char			no_index;
+	char			with_zeros;
+	int			bucket_size;
+	int			entries;
+};
+
+struct timerlat_hist_cpu {
+	int			*irq;
+	int			*thread;
+
+	int			irq_count;
+	int			thread_count;
+
+	unsigned long long	min_irq;
+	unsigned long long	sum_irq;
+	unsigned long long	max_irq;
+
+	unsigned long long	min_thread;
+	unsigned long long	sum_thread;
+	unsigned long long	max_thread;
+};
+
+struct timerlat_hist_data {
+	struct timerlat_hist_cpu	*hist;
+	int				entries;
+	int				bucket_size;
+	int				nr_cpus;
+};
+
+/*
+ * timerlat_free_histogram - free runtime data
+ */
+static void
+timerlat_free_histogram(struct timerlat_hist_data *data)
+{
+	int cpu;
+
+	/* one histogram for IRQ and one for thread, per CPU */
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (data->hist[cpu].irq)
+			free(data->hist[cpu].irq);
+
+		if (data->hist[cpu].thread)
+			free(data->hist[cpu].thread);
+	}
+
+	/* one set of histograms per CPU */
+	if (data->hist)
+		free(data->hist);
+
+	free(data);
+}
+
+/*
+ * timerlat_alloc_histogram - alloc runtime data
+ */
+static struct timerlat_hist_data
+*timerlat_alloc_histogram(int nr_cpus, int entries, int bucket_size)
+{
+	struct timerlat_hist_data *data;
+	int cpu;
+
+	data = calloc(1, sizeof(*data));
+	if (!data)
+		return NULL;
+
+	data->entries = entries;
+	data->bucket_size = bucket_size;
+	data->nr_cpus = nr_cpus;
+
+	/* one set of histograms per CPU */
+	data->hist = calloc(1, sizeof(*data->hist) * nr_cpus);
+	if (!data->hist)
+		goto cleanup;
+
+	/* one histogram for IRQ and one for thread, per cpu */
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		data->hist[cpu].irq = calloc(1, sizeof(*data->hist->irq) * (entries + 1));
+		if (!data->hist[cpu].irq)
+			goto cleanup;
+		data->hist[cpu].thread = calloc(1, sizeof(*data->hist->thread) * (entries + 1));
+		if (!data->hist[cpu].thread)
+			goto cleanup;
+	}
+
+	/* set the min to max */
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		data->hist[cpu].min_irq = ~0;
+		data->hist[cpu].min_thread = ~0;
+	}
+
+	return data;
+
+cleanup:
+	timerlat_free_histogram(data);
+	return NULL;
+}
+
+/*
+ * timerlat_hist_update - record a new timerlat occurent on cpu, updating data
+ */
+static void
+timerlat_hist_update(struct osnoise_tool *tool, int cpu,
+		     unsigned long long thread,
+		     unsigned long long latency)
+{
+	struct timerlat_hist_params *params = tool->params;
+	struct timerlat_hist_data *data = tool->data;
+	int entries = data->entries;
+	int bucket;
+	int *hist;
+
+	if (params->output_divisor)
+		latency = latency / params->output_divisor;
+
+	if (data->bucket_size)
+		bucket = latency / data->bucket_size;
+
+	if (!thread) {
+		hist = data->hist[cpu].irq;
+		data->hist[cpu].irq_count++;
+		update_min(&data->hist[cpu].min_irq, &latency);
+		update_sum(&data->hist[cpu].sum_irq, &latency);
+		update_max(&data->hist[cpu].max_irq, &latency);
+	} else {
+		hist = data->hist[cpu].thread;
+		data->hist[cpu].thread_count++;
+		update_min(&data->hist[cpu].min_thread, &latency);
+		update_sum(&data->hist[cpu].sum_thread, &latency);
+		update_max(&data->hist[cpu].max_thread, &latency);
+	}
+
+	if (bucket < entries)
+		hist[bucket]++;
+	else
+		hist[entries]++;
+}
+
+/*
+ * timerlat_hist_handler - this is the handler for timerlat tracer events
+ */
+static int
+timerlat_hist_handler(struct trace_seq *s, struct tep_record *record,
+		     struct tep_event *event, void *data)
+{
+	struct trace_instance *trace = data;
+	unsigned long long thread, latency;
+	struct osnoise_tool *tool;
+	int cpu = record->cpu;
+
+	tool = container_of(trace, struct osnoise_tool, trace);
+
+	tep_get_field_val(s, event, "context", record, &thread, 1);
+	tep_get_field_val(s, event, "timer_latency", record, &latency, 1);
+
+	timerlat_hist_update(tool, cpu, thread, latency);
+
+	return 0;
+}
+
+/*
+ * timerlat_hist_header - print the header of the tracer to the output
+ */
+static void timerlat_hist_header(struct osnoise_tool *tool)
+{
+	struct timerlat_hist_params *params = tool->params;
+	struct timerlat_hist_data *data = tool->data;
+	struct trace_seq *s = tool->trace.seq;
+	char duration[26];
+	int cpu;
+
+	if (params->no_header)
+		return;
+
+	get_duration(tool->start_time, duration, sizeof(duration));
+	trace_seq_printf(s, "# RTLA timerlat histogram\n");
+	trace_seq_printf(s, "# Time unit is %s (%s)\n",
+			params->output_divisor == 1 ? "nanoseconds" : "microseconds",
+			params->output_divisor == 1 ? "ns" : "us");
+
+	trace_seq_printf(s, "# Duration: %s\n", duration);
+
+	if (!params->no_index)
+		trace_seq_printf(s, "Index");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq)
+			trace_seq_printf(s, "   IRQ-%03d", cpu);
+
+		if (!params->no_thread)
+			trace_seq_printf(s, "   Thr-%03d", cpu);
+	}
+	trace_seq_printf(s, "\n");
+
+
+	trace_seq_do_printf(s);
+	trace_seq_reset(s);
+}
+
+/*
+ * timerlat_print_summary - print the summary of the hist data to the output
+ */
+static void
+timerlat_print_summary(struct timerlat_hist_params *params,
+		       struct trace_instance *trace,
+		       struct timerlat_hist_data *data)
+{
+	int cpu;
+
+	if (params->no_summary)
+		return;
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "count:");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq)
+			trace_seq_printf(trace->seq, "%9d ",
+					data->hist[cpu].irq_count);
+
+		if (!params->no_thread)
+			trace_seq_printf(trace->seq, "%9d ",
+					data->hist[cpu].thread_count);
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "min:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq)
+			trace_seq_printf(trace->seq, "%9llu ",
+					data->hist[cpu].min_irq);
+
+		if (!params->no_thread)
+			trace_seq_printf(trace->seq, "%9llu ",
+					data->hist[cpu].min_thread);
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "avg:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq) {
+			if (data->hist[cpu].irq_count)
+				trace_seq_printf(trace->seq, "%9llu ",
+						 data->hist[cpu].sum_irq / data->hist[cpu].irq_count);
+			else
+				trace_seq_printf(trace->seq, "        - ");
+		}
+
+		if (!params->no_thread) {
+			if (data->hist[cpu].thread_count)
+				trace_seq_printf(trace->seq, "%9llu ",
+						data->hist[cpu].sum_thread / data->hist[cpu].thread_count);
+			else
+				trace_seq_printf(trace->seq, "        - ");
+		}
+	}
+	trace_seq_printf(trace->seq, "\n");
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "max:  ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq)
+			trace_seq_printf(trace->seq, "%9llu ",
+					data->hist[cpu].max_irq);
+
+		if (!params->no_thread)
+			trace_seq_printf(trace->seq, "%9llu ",
+					data->hist[cpu].max_thread);
+	}
+	trace_seq_printf(trace->seq, "\n");
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+}
+
+/*
+ * timerlat_print_stats - print data for all CPUs
+ */
+static void
+timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool)
+{
+	struct timerlat_hist_data *data = tool->data;
+	struct trace_instance *trace = &tool->trace;
+	int bucket, cpu;
+	int total;
+
+	timerlat_hist_header(tool);
+
+	for (bucket = 0; bucket < data->entries; bucket++) {
+		total = 0;
+
+		if (!params->no_index)
+			trace_seq_printf(trace->seq, "%-6d",
+					 bucket * data->bucket_size);
+
+		for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+			if (params->cpus && !params->monitored_cpus[cpu])
+				continue;
+
+			if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+				continue;
+
+			if (!params->no_irq) {
+				total += data->hist[cpu].irq[bucket];
+				trace_seq_printf(trace->seq, "%9d ",
+						data->hist[cpu].irq[bucket]);
+			}
+
+			if (!params->no_thread) {
+				total += data->hist[cpu].thread[bucket];
+				trace_seq_printf(trace->seq, "%9d ",
+						data->hist[cpu].thread[bucket]);
+			}
+
+		}
+
+		if (total == 0 && !params->with_zeros) {
+			trace_seq_reset(trace->seq);
+			continue;
+		}
+
+		trace_seq_printf(trace->seq, "\n");
+		trace_seq_do_printf(trace->seq);
+		trace_seq_reset(trace->seq);
+	}
+
+	if (!params->no_index)
+		trace_seq_printf(trace->seq, "over: ");
+
+	for (cpu = 0; cpu < data->nr_cpus; cpu++) {
+		if (params->cpus && !params->monitored_cpus[cpu])
+			continue;
+
+		if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count)
+			continue;
+
+		if (!params->no_irq)
+			trace_seq_printf(trace->seq, "%9d ",
+					 data->hist[cpu].irq[data->entries]);
+
+		if (!params->no_thread)
+			trace_seq_printf(trace->seq, "%9d ",
+					 data->hist[cpu].thread[data->entries]);
+	}
+	trace_seq_printf(trace->seq, "\n");
+	trace_seq_do_printf(trace->seq);
+	trace_seq_reset(trace->seq);
+
+	timerlat_print_summary(params, trace, data);
+}
+
+/*
+ * timerlat_hist_usage - prints timerlat top usage message
+ */
+static void timerlat_hist_usage(char *usage)
+{
+	int i;
+
+	char *msg[] = {
+		"",
+		"  usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] [-t[=file]] \\",
+		"         [-c cpu-list] [-P priority] [-e N] [-b N]  [--no-irq] [--no-thread] [--no-header] [--no-summary] \\",
+		"         [--no-index] [--with-zeros]",
+		"",
+		"	  -h/--help: print this menu",
+		"	  -p/--period us: timerlat period in us",
+		"	  -i/--irq us: stop trace if the irq latency is higher than the argument in us",
+		"	  -T/--thread us: stop trace if the thread latency is higher than the argument in us",
+		"	  -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us",
+		"	  -c/--cpus cpus: run the tracer only on the given cpus",
+		"	  -d/--duration time[m|h|d]: duration of the session in seconds",
+		"	  -D/--debug: print debug info",
+		"	  -T/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]",
+		"	  -n/--nano: display data in nanoseconds",
+		"	  -b/--bucket-size N: set the histogram bucket size (default 1)",
+		"	  -e/--entries N: set the number of entries of the histogram (default 256)",
+		"	     --no-irq: ignore IRQ latencies",
+		"	     --no-thread: ignore thread latencies",
+		"	     --no-header: do not print header",
+		"	     --no-summary: do not print summary",
+		"	     --no-index: do not print index",
+		"	     --with-zeros: print zero only entries",
+		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
+		"		o:prio - use SCHED_OTHER with prio",
+		"		r:prio - use SCHED_RR with prio",
+		"		f:prio - use SCHED_FIFO with prio",
+		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
+		"						       in nanoseconds",
+		NULL,
+	};
+
+	if (usage)
+		fprintf(stderr, "%s\n", usage);
+
+	fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n",
+			VERSION);
+
+	for (i = 0; msg[i]; i++)
+		fprintf(stderr, "%s\n", msg[i]);
+	exit(1);
+}
+
+/*
+ * timerlat_hist_parse_args - allocs, parse and fill the cmd line parameters
+ */
+static struct timerlat_hist_params
+*timerlat_hist_parse_args(int argc, char *argv[])
+{
+	struct timerlat_hist_params *params;
+	int retval;
+	int c;
+
+	params = calloc(1, sizeof(*params));
+	if (!params)
+		exit(1);
+
+	/* display data in microseconds */
+	params->output_divisor = 1000;
+	params->bucket_size = 1;
+	params->entries = 256;
+
+	while (1) {
+		static struct option long_options[] = {
+			{"cpus",		required_argument,	0, 'c'},
+			{"bucket-size",		required_argument,	0, 'b'},
+			{"debug",		no_argument,		0, 'D'},
+			{"entries",		required_argument,	0, 'e'},
+			{"duration",		required_argument,	0, 'd'},
+			{"help",		no_argument,		0, 'h'},
+			{"irq",			required_argument,	0, 'i'},
+			{"nano",		no_argument,		0, 'n'},
+			{"period",		required_argument,	0, 'p'},
+			{"priority",		required_argument,	0, 'P'},
+			{"stack",		required_argument,	0, 's'},
+			{"thread",		required_argument,	0, 'T'},
+			{"trace",		optional_argument,	0, 't'},
+			{"no-irq",		no_argument,		0, '0'},
+			{"no-thread",		no_argument,		0, '1'},
+			{"no-header",		no_argument,		0, '2'},
+			{"no-summary",		no_argument,		0, '3'},
+			{"no-index",		no_argument,		0, '4'},
+			{"with-zeros",		no_argument,		0, '5'},
+			{0, 0, 0, 0}
+		};
+
+		/* getopt_long stores the option index here. */
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, "c:b:d:e:Dhi:np:P:s:t::T:012345",
+				 long_options, &option_index);
+
+		/* detect the end of the options. */
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			retval = parse_cpu_list(optarg, &params->monitored_cpus);
+			if (retval)
+				timerlat_hist_usage("\nInvalid -c cpu list\n");
+			params->cpus = optarg;
+			break;
+		case 'b':
+			params->bucket_size = get_llong_from_str(optarg);
+			if ((params->bucket_size == 0) || (params->bucket_size >= 1000000))
+				timerlat_hist_usage("Bucket size needs to be > 0 and <= 1000000\n");
+			break;
+		case 'D':
+			config_debug = 1;
+			break;
+		case 'd':
+			params->duration = parse_seconds_duration(optarg);
+			if (!params->duration)
+				timerlat_hist_usage("Invalid -D duration\n");
+			break;
+		case 'e':
+			params->entries = get_llong_from_str(optarg);
+			if ((params->entries < 10) || (params->entries > 9999999))
+					timerlat_hist_usage("Entries must be > 10 and < 9999999\n");
+			break;
+		case 'h':
+		case '?':
+			timerlat_hist_usage(NULL);
+			break;
+		case 'i':
+			params->stop_us = get_llong_from_str(optarg);
+			break;
+		case 'n':
+			params->output_divisor = 1;
+			break;
+		case 'p':
+			params->timerlat_period_us = get_llong_from_str(optarg);
+			if (params->timerlat_period_us > 1000000)
+				timerlat_hist_usage("Period longer than 1 s\n");
+			break;
+		case 'P':
+			retval = parse_prio(optarg, &params->sched_param);
+			if (retval == -1)
+				timerlat_hist_usage("Invalid -P priority");
+			params->set_sched = 1;
+			break;
+		case 's':
+			params->print_stack = get_llong_from_str(optarg);
+			break;
+		case 'T':
+			params->stop_total_us = get_llong_from_str(optarg);
+			break;
+		case 't':
+			if (optarg)
+				/* skip = */
+				params->trace_output = &optarg[1];
+			else
+				params->trace_output = "timerlat_trace.txt";
+			break;
+		case '0': /* no irq */
+			params->no_irq = 1;
+			break;
+		case '1': /* no thread */
+			params->no_thread = 1;
+			break;
+		case '2': /* no header */
+			params->no_header = 1;
+			break;
+		case '3': /* no summary */
+			params->no_summary = 1;
+			break;
+		case '4': /* no index */
+			params->no_index = 1;
+			break;
+		case '5': /* with zeros */
+			params->with_zeros = 1;
+			break;
+		default:
+			timerlat_hist_usage("Invalid option");
+		}
+	}
+
+	if (geteuid()) {
+		err_msg("rtla needs root permission\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (params->no_irq && params->no_thread)
+		timerlat_hist_usage("no-irq and no-thread set, there is nothing to do here");
+
+	if (params->no_index && !params->with_zeros)
+		timerlat_hist_usage("no-index set with with-zeros is not set - it does not make sense");
+
+	return params;
+}
+
+/*
+ * timerlat_hist_apply_config - apply the hist configs to the initialized tool
+ */
+static int
+timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_params *params)
+{
+	int retval;
+
+	if (!params->sleep_time)
+		params->sleep_time = 1;
+
+	if (params->cpus) {
+		retval = osnoise_set_cpus(tool->context, params->cpus);
+		if (retval) {
+			err_msg("Failed to apply CPUs config\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_us) {
+		retval = osnoise_set_stop_us(tool->context, params->stop_us);
+		if (retval) {
+			err_msg("Failed to set stop us\n");
+			goto out_err;
+		}
+	}
+
+	if (params->stop_total_us) {
+		retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us);
+		if (retval) {
+			err_msg("Failed to set stop total us\n");
+			goto out_err;
+		}
+	}
+
+	if (params->timerlat_period_us) {
+		retval = osnoise_set_timerlat_period_us(tool->context, params->timerlat_period_us);
+		if (retval) {
+			err_msg("Failed to set timerlat period\n");
+			goto out_err;
+		}
+	}
+
+	if (params->print_stack) {
+		retval = osnoise_set_print_stack(tool->context, params->print_stack);
+		if (retval) {
+			err_msg("Failed to set print stack\n");
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	return -1;
+}
+
+/*
+ * timerlat_init_hist - initialize a timerlat hist tool with parameters
+ */
+static struct osnoise_tool
+*timerlat_init_hist(struct timerlat_hist_params *params)
+{
+	struct osnoise_tool *tool;
+	int nr_cpus;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+	tool = osnoise_init_tool("timerlat_hist");
+	if (!tool)
+		return NULL;
+
+	tool->data = timerlat_alloc_histogram(nr_cpus, params->entries, params->bucket_size);
+	if (!tool->data)
+		goto out_err;
+
+	tool->params = params;
+
+	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "timerlat",
+				   timerlat_hist_handler, tool);
+
+	return tool;
+
+out_err:
+	osnoise_destroy_tool(tool);
+	return NULL;
+}
+
+static int stop_tracing;
+static void stop_hist(int sig)
+{
+	stop_tracing = 1;
+}
+
+/*
+ * timerlat_hist_set_signals - handles the signal to stop the tool
+ */
+static void
+timerlat_hist_set_signals(struct timerlat_hist_params *params)
+{
+	signal(SIGINT, stop_hist);
+	if (params->duration) {
+		signal(SIGALRM, stop_hist);
+		alarm(params->duration);
+	}
+}
+
+int timerlat_hist_main(int argc, char *argv[])
+{
+	struct timerlat_hist_params *params;
+	struct trace_instance *trace;
+	struct osnoise_tool *record;
+	struct osnoise_tool *tool;
+	int return_value = 1;
+	int retval;
+
+	params = timerlat_hist_parse_args(argc, argv);
+	if (!params)
+		exit(1);
+
+	tool = timerlat_init_hist(params);
+	if (!tool) {
+		err_msg("Could not init osnoise hist\n");
+		goto out_exit;
+	}
+
+	retval = timerlat_hist_apply_config(tool, params);
+	if (retval) {
+		err_msg("Could not apply config\n");
+		goto out_hist;
+	}
+
+	trace = &tool->trace;
+
+	retval = enable_timerlat(trace);
+	if (retval) {
+		err_msg("Failed to enable timerlat tracer\n");
+		goto out_hist;
+	}
+
+	if (params->set_sched) {
+		retval = set_comm_sched_attr("timerlat/", &params->sched_param);
+		if (retval) {
+			err_msg("Failed to set sched parameters\n");
+			goto out_hist;
+		}
+	}
+
+	trace_instance_start(trace);
+
+	if (params->trace_output) {
+		record = osnoise_init_trace_tool("timerlat");
+		if (!record) {
+			err_msg("Failed to enable the trace instance\n");
+			goto out_hist;
+		}
+		trace_instance_start(&record->trace);
+	}
+
+	tool->start_time = time(NULL);
+	timerlat_hist_set_signals(params);
+
+	while (!stop_tracing) {
+		sleep(params->sleep_time);
+
+		retval = tracefs_iterate_raw_events(trace->tep,
+						    trace->inst,
+						    NULL,
+						    0,
+						    collect_registered_events,
+						    trace);
+		if (retval < 0) {
+			err_msg("Error iterating on events\n");
+			goto out_hist;
+		}
+
+		if (!tracefs_trace_is_on(trace->inst))
+			break;
+	};
+
+	timerlat_print_stats(params, tool);
+
+	return_value = 0;
+
+	if (!tracefs_trace_is_on(trace->inst)) {
+		printf("rtla timelat hit stop tracing\n");
+		if (params->trace_output) {
+			printf("  Saving trace to %s\n", params->trace_output);
+			save_trace_to_file(record->trace.inst, params->trace_output);
+		}
+	}
+
+out_hist:
+	timerlat_free_histogram(tool->data);
+	osnoise_destroy_tool(tool);
+	if (params->trace_output)
+		osnoise_destroy_tool(record);
+	free(params);
+out_exit:
+	exit(return_value);
+}
-- 
cgit v1.2.3


From d40d48e1f1f25abe755521e33a67f5d2b8b9443c Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 10 Dec 2021 19:11:27 +0100
Subject: rtla: Add Documentation

Adds the basis for rtla documentation. This patch also
includes the rtla(1) man page.

As suggested by Jonathan Corbet, we are placing these man
pages at Documentation/tools/rtla, using rst format. It
is not linked to the official documentation, though.

The Makefile is based on bpftool's Documentation one.

Link: https://lkml.kernel.org/r/5f510f3e962fc0cd531c43f5a815544dd720c3f2.1639158831.git.bristot@kernel.org

Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: linux-rt-users@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Suggested-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/tools/rtla/Makefile            | 41 ++++++++++++++++++++++++
 Documentation/tools/rtla/common_appendix.rst | 12 +++++++
 Documentation/tools/rtla/rtla.rst            | 48 ++++++++++++++++++++++++++++
 tools/tracing/rtla/Makefile                  | 30 ++++++++++++++---
 4 files changed, 127 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/tools/rtla/Makefile
 create mode 100644 Documentation/tools/rtla/common_appendix.rst
 create mode 100644 Documentation/tools/rtla/rtla.rst

(limited to 'tools')

diff --git a/Documentation/tools/rtla/Makefile b/Documentation/tools/rtla/Makefile
new file mode 100644
index 000000000000..9f2b84af1a6c
--- /dev/null
+++ b/Documentation/tools/rtla/Makefile
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Based on bpftool's Documentation Makefile
+
+INSTALL		?= install
+RM		?= rm -f
+RMDIR		?= rmdir --ignore-fail-on-non-empty
+
+PREFIX		?= /usr/share
+MANDIR		?= $(PREFIX)/man
+MAN1DIR		= $(MANDIR)/man1
+
+MAN1_RST	= $(wildcard rtla*.rst)
+
+_DOC_MAN1	= $(patsubst %.rst,%.1,$(MAN1_RST))
+DOC_MAN1	= $(addprefix $(OUTPUT),$(_DOC_MAN1))
+
+RST2MAN_DEP	:= $(shell command -v rst2man 2>/dev/null)
+RST2MAN_OPTS	+= --verbose
+
+$(OUTPUT)%.1: %.rst
+ifndef RST2MAN_DEP
+	$(error "rst2man not found, but required to generate man pages")
+endif
+	rst2man $(RST2MAN_OPTS) $< > $@
+
+man1: $(DOC_MAN1)
+man: man1
+
+clean:
+	$(RM) $(DOC_MAN1)
+
+install: man
+	$(INSTALL) -d -m 755 $(DESTDIR)$(MAN1DIR)
+	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(MAN1DIR)
+
+uninstall:
+	$(RM) $(addprefix $(DESTDIR)$(MAN1DIR)/,$(_DOC_MAN1))
+	$(RMDIR) $(DESTDIR)$(MAN1DIR)
+
+.PHONY: man man1 clean install uninstall
+.DEFAULT_GOAL := man
diff --git a/Documentation/tools/rtla/common_appendix.rst b/Documentation/tools/rtla/common_appendix.rst
new file mode 100644
index 000000000000..b494084acccd
--- /dev/null
+++ b/Documentation/tools/rtla/common_appendix.rst
@@ -0,0 +1,12 @@
+REPORTING BUGS
+==============
+Report bugs to <lkml@vger.kernel.org>
+
+LICENSE
+=======
+**rtla** is Free Software licensed under the GNU GPLv2
+
+COPYING
+=======
+Copyright \(C) 2021 Red Hat, Inc. Free use of this software is granted under
+the terms of the GNU Public License (GPL).
diff --git a/Documentation/tools/rtla/rtla.rst b/Documentation/tools/rtla/rtla.rst
new file mode 100644
index 000000000000..fc0d233efcd5
--- /dev/null
+++ b/Documentation/tools/rtla/rtla.rst
@@ -0,0 +1,48 @@
+=========
+rtla
+=========
+--------------------------------
+Real-time Linux Analysis tool
+--------------------------------
+
+:Manual section: 1
+
+SYNOPSIS
+========
+**rtla** *COMMAND* [*OPTIONS*]
+
+DESCRIPTION
+===========
+The **rtla** is a meta-tool that includes a set of commands that aims to
+analyze the real-time properties of Linux. But instead of testing Linux
+as a black box, **rtla** leverages kernel tracing capabilities to provide
+precise information about the properties and root causes of unexpected
+results.
+
+COMMANDS
+========
+**osnoise**
+
+        Gives information about the operating system noise (osnoise).
+
+**timerlat**
+
+        Measures the IRQ and thread timer latency.
+
+OPTIONS
+=======
+**-h**, **--help**
+
+        Display the help text.
+
+For other options, see the man page for the corresponding command.
+
+SEE ALSO
+========
+**rtla-osnoise**\(1), **rtla-timerlat**\(1)
+
+AUTHOR
+======
+Daniel Bristot de Oliveira <bristot@kernel.org>
+
+.. include:: common_appendix.rst
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 7a7348faec32..2d52ff0bff7d 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -45,18 +45,28 @@ DATADIR	:=	/usr/share
 DOCDIR	:=	$(DATADIR)/doc
 MANDIR	:=	$(DATADIR)/man
 LICDIR	:=	$(DATADIR)/licenses
+SRCTREE	:=	$(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR))
+
+# If running from the tarball, man pages are stored in the Documentation
+# dir. If running from the kernel source, man pages are stored in
+# Documentation/tools/rtla/.
+ifneq ($(wildcard Documentation/.*),)
+DOCSRC	=	Documentation/
+else
+DOCSRC	=	$(SRCTREE)/../../../Documentation/tools/rtla/
+endif
 
 .PHONY:	all
 all:	rtla
 
-rtla: $(OBJ)
+rtla: $(OBJ) doc
 	$(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS)
 
 static: $(OBJ)
 	$(CC) -o rtla-static $(LDFLAGS) --static $(OBJ) $(LIBS) -lpthread -ldl
 
 .PHONY: install
-install:
+install: doc_install
 	$(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR)
 	$(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR)
 	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
@@ -66,15 +76,27 @@ install:
 	ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/timerlat
 
 .PHONY: clean tarball
-clean:
+clean: doc_clean
 	@test ! -f rtla || rm rtla
 	@test ! -f rtla-static || rm rtla-static
 	@test ! -f src/rtla.o || rm src/rtla.o
 	@test ! -f $(TARBALL) || rm -f $(TARBALL)
 	@rm -rf *~ $(OBJ) *.tar.$(CEXT)
 
-tarball:  clean
+tarball: clean
 	rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION)
 	cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION)
+	mkdir $(NAME)-$(VERSION)/Documentation/
+	cp -rp $(SRCTREE)/../../../Documentation/tools/rtla/* $(NAME)-$(VERSION)/Documentation/
 	tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION)
 	rm -rf $(NAME)-$(VERSION)
+
+.PHONY: doc doc_clean doc_install
+doc:
+	$(MAKE) -C $(DOCSRC)
+
+doc_clean:
+	$(MAKE) -C $(DOCSRC) clean
+
+doc_install:
+	$(MAKE) -C $(DOCSRC) install
-- 
cgit v1.2.3


From 2255634100bffe0abc1a300f20aee26980891ba4 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 14 Jan 2022 11:02:46 +0800
Subject: kselftests/net: list all available tests in usage()

So that users can run/query them easily.

$ ./fcnal-test.sh -h
usage: fcnal-test.sh OPTS

	-4          IPv4 tests only
	-6          IPv6 tests only
	-t <test>   Test name/set to run
	-p          Pause on fail
	-P          Pause after each test
	-v          Be verbose

Tests:
	ipv4_ping ipv4_tcp ipv4_udp ipv4_bind ipv4_runtime ipv4_netfilter ipv6_ping ipv6_tcp ipv6_udp ipv6_bind ipv6_runtime ipv6_netfilter use_cases

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 412d85205546..3f4c8cfe7aca 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -4059,6 +4059,9 @@ usage: ${0##*/} OPTS
 	-p          Pause on fail
 	-P          Pause after each test
 	-v          Be verbose
+
+Tests:
+	$TESTS_IPV4 $TESTS_IPV6 $TESTS_OTHER
 EOF
 }
 
-- 
cgit v1.2.3


From e652ab64e5846d3fe5ac2c0405d55d79ecc52c36 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 7 Aug 2020 08:45:47 -0300
Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources

To pick up the changes in:

  89aa94b4a218339b ("x86/msr: Add AMD CPPC MSR definitions")

Addressing these tools/perf build warnings:

    diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h
    Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h'

That makes the beautification scripts to pick some new entries:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  --- before	2022-01-13 10:59:51.743416890 -0300
  +++ after	2022-01-13 11:00:00.776644178 -0300
  @@ -303,6 +303,11 @@
 	  [0xc0010299 - x86_AMD_V_KVM_MSRs_offset] = "AMD_RAPL_POWER_UNIT",
 	  [0xc001029a - x86_AMD_V_KVM_MSRs_offset] = "AMD_CORE_ENERGY_STATUS",
 	  [0xc001029b - x86_AMD_V_KVM_MSRs_offset] = "AMD_PKG_ENERGY_STATUS",
  +       [0xc00102b0 - x86_AMD_V_KVM_MSRs_offset] = "AMD_CPPC_CAP1",
  +       [0xc00102b1 - x86_AMD_V_KVM_MSRs_offset] = "AMD_CPPC_ENABLE",
  +       [0xc00102b2 - x86_AMD_V_KVM_MSRs_offset] = "AMD_CPPC_CAP2",
  +       [0xc00102b3 - x86_AMD_V_KVM_MSRs_offset] = "AMD_CPPC_REQ",
  +       [0xc00102b4 - x86_AMD_V_KVM_MSRs_offset] = "AMD_CPPC_STATUS",
 	  [0xc00102f0 - x86_AMD_V_KVM_MSRs_offset] = "AMD_PPIN_CTL",
 	  [0xc00102f1 - x86_AMD_V_KVM_MSRs_offset] = "AMD_PPIN",
   };
  $

And this gets rebuilt:

  CC       /tmp/build/perf/trace/beauty/tracepoints/x86_msr.o
  INSTALL  trace_plugins
  LD       /tmp/build/perf/trace/beauty/tracepoints/perf-in.o
  LD       /tmp/build/perf/trace/beauty/perf-in.o
  LD       /tmp/build/perf/perf-in.o
  LINK     /tmp/build/perf/perf

Now one can trace systemwide asking to see backtraces to where those
MSRs are being read/written with:

  # perf trace -e msr:*_msr/max-stack=32/ --filter="msr>=AMD_CPPC_CAP1 && msr<=AMD_CPPC_STATUS"
  ^C#

If we use -v (verbose mode) we can see what it does behind the scenes:

  # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr>=AMD_CPPC_CAP1 && msr<=AMD_CPPC_STATUS"
  <SNIP>
  New filter for msr:read_msr: (msr>=0xc00102b0 && msr<=0xc00102b4) && (common_pid != 2612102 && common_pid != 3841)
  New filter for msr:write_msr: (msr>=0xc00102b0 && msr<=0xc00102b4) && (common_pid != 2612102 && common_pid != 3841)
  <SNIP>
  ^C#

Example with a frequent msr:

  # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr==IA32_SPEC_CTRL" --max-events 2
  Using CPUID AuthenticAMD-25-21-0
  0x48
  New filter for msr:read_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841)
  0x48
  New filter for msr:write_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841)
  mmap size 528384B
  Looking at the vmlinux_path (8 entries long)
  symsrc__init: build id mismatch for vmlinux.
  Using /proc/kcore for kernel data
  Using /proc/kallsyms for symbols
       0.000 Timer/2525383 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
                                         do_trace_write_msr ([kernel.kallsyms])
                                         do_trace_write_msr ([kernel.kallsyms])
                                         __switch_to_xtra ([kernel.kallsyms])
                                         __switch_to ([kernel.kallsyms])
                                         __schedule ([kernel.kallsyms])
                                         schedule ([kernel.kallsyms])
                                         futex_wait_queue_me ([kernel.kallsyms])
                                         futex_wait ([kernel.kallsyms])
                                         do_futex ([kernel.kallsyms])
                                         __x64_sys_futex ([kernel.kallsyms])
                                         do_syscall_64 ([kernel.kallsyms])
                                         entry_SYSCALL_64_after_hwframe ([kernel.kallsyms])
                                         __futex_abstimed_wait_common64 (/usr/lib64/libpthread-2.33.so)
       0.030 :0/0 msr:write_msr(msr: IA32_SPEC_CTRL, val: 2)
                                         do_trace_write_msr ([kernel.kallsyms])
                                         do_trace_write_msr ([kernel.kallsyms])
                                         __switch_to_xtra ([kernel.kallsyms])
                                         __switch_to ([kernel.kallsyms])
                                         __schedule ([kernel.kallsyms])
                                         schedule_idle ([kernel.kallsyms])
                                         do_idle ([kernel.kallsyms])
                                         cpu_startup_entry ([kernel.kallsyms])
                                         secondary_startup_64_no_verify ([kernel.kallsyms])
  #

Acked-by: Huang Rui <ray.huang@amd.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/all/YeA2PAvHV+uHRhLj@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/msr-index.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 01e2650b9585..3faf0f97edb1 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -486,6 +486,23 @@
 
 #define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f
 
+/* AMD Collaborative Processor Performance Control MSRs */
+#define MSR_AMD_CPPC_CAP1		0xc00102b0
+#define MSR_AMD_CPPC_ENABLE		0xc00102b1
+#define MSR_AMD_CPPC_CAP2		0xc00102b2
+#define MSR_AMD_CPPC_REQ		0xc00102b3
+#define MSR_AMD_CPPC_STATUS		0xc00102b4
+
+#define AMD_CPPC_LOWEST_PERF(x)		(((x) >> 0) & 0xff)
+#define AMD_CPPC_LOWNONLIN_PERF(x)	(((x) >> 8) & 0xff)
+#define AMD_CPPC_NOMINAL_PERF(x)	(((x) >> 16) & 0xff)
+#define AMD_CPPC_HIGHEST_PERF(x)	(((x) >> 24) & 0xff)
+
+#define AMD_CPPC_MAX_PERF(x)		(((x) & 0xff) << 0)
+#define AMD_CPPC_MIN_PERF(x)		(((x) & 0xff) << 8)
+#define AMD_CPPC_DES_PERF(x)		(((x) & 0xff) << 16)
+#define AMD_CPPC_ENERGY_PERF_PREF(x)	(((x) & 0xff) << 24)
+
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
 
-- 
cgit v1.2.3


From 46f57d2410150985f81da7cbbb5fdcda01d02ac2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 13 Jan 2022 22:48:22 -0800
Subject: perf arm: Fix off-by-one directory path

Relative path include works in the regular build due to -I paths but may
fail in other situations.

Fixes: 83869019c74cc2d0 ("perf arch: Support register names from all archs")
Reviewed-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Truong <alexandre.truong@arm.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220114064822.1806019-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/arm64-frame-pointer-unwind-support.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.c b/tools/perf/util/arm64-frame-pointer-unwind-support.c
index 4f5ecf51ed38..2242a885fbd7 100644
--- a/tools/perf/util/arm64-frame-pointer-unwind-support.c
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.c
@@ -6,7 +6,7 @@
 #include "unwind.h"
 
 #define perf_event_arm_regs perf_event_arm64_regs
-#include "../arch/arm64/include/uapi/asm/perf_regs.h"
+#include "../../arch/arm64/include/uapi/asm/perf_regs.h"
 #undef perf_event_arm_regs
 
 struct entries {
-- 
cgit v1.2.3


From 99fc11bb5b6f19d2c3671d6cf38571cb3dedb472 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 13 Jan 2022 22:51:05 -0800
Subject: libperf tests: Update a use of the new cpumap API

Fixes a build breakage.

Fixes: 6d18804b963b78dc ("perf cpumap: Give CPUs their own type")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: colin ian king <colin.king@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Link: http://lore.kernel.org/lkml/20220114065105.1806542-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/tests/test-evlist.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c
index e7afff12c35a..b3479dfa9a1c 100644
--- a/tools/lib/perf/tests/test-evlist.c
+++ b/tools/lib/perf/tests/test-evlist.c
@@ -334,7 +334,8 @@ static int test_mmap_cpus(void)
 	};
 	cpu_set_t saved_mask;
 	char path[PATH_MAX];
-	int id, err, cpu, tmp;
+	int id, err, tmp;
+	struct perf_cpu cpu;
 	union perf_event *event;
 	int count = 0;
 
@@ -377,7 +378,7 @@ static int test_mmap_cpus(void)
 		cpu_set_t mask;
 
 		CPU_ZERO(&mask);
-		CPU_SET(cpu, &mask);
+		CPU_SET(cpu.cpu, &mask);
 
 		err = sched_setaffinity(0, sizeof(mask), &mask);
 		__T("sched_setaffinity failed", err == 0);
-- 
cgit v1.2.3


From e000ea0beffb5497425054b151369fe37a792ece Mon Sep 17 00:00:00 2001
From: José Expósito <jose.exposito89@gmail.com>
Date: Wed, 8 Dec 2021 18:11:13 +0100
Subject: perf metricgroup: Fix use after free in metric__new()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We shouldn't free() something that will be used in the next line, fix
it.

Fixes: b85a4d61d3022608 ("perf metric: Allow modifiers on metrics")
Addresses-Coverity-ID: 1494000
Signed-off-by: José Expósito <jose.exposito89@gmail.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20211208171113.22089-1-jose.exposito89@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/metricgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 51c99cb08abf..8826c555f780 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -209,8 +209,8 @@ static struct metric *metric__new(const struct pmu_event *pe,
 	m->metric_name = pe->metric_name;
 	m->modifier = modifier ? strdup(modifier) : NULL;
 	if (modifier && !m->modifier) {
-		free(m);
 		expr__ctx_free(m->pctx);
+		free(m);
 		return NULL;
 	}
 	m->metric_expr = pe->metric_expr;
-- 
cgit v1.2.3


From a6e62743621ea29bea461774c0bcc68e5de59068 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 24 Nov 2021 10:03:43 +0100
Subject: perf cputopo: Fix CPU topology reading on s/390

Commit fdf1e29b6118c18f ("perf expr: Add metric literals for topology.")
fails on s390:

 # ./perf test -Fv 7
   ...
 # FAILED tests/expr.c:173 #num_dies >= #num_packages
   ---- end ----
   Simple expression parser: FAILED!
 #

Investigating this issue leads to these functions:
 build_cpu_topology()
   +--> has_die_topology(void)
        {
           struct utsname uts;

           if (uname(&uts) < 0)
                  return false;
           if (strncmp(uts.machine, "x86_64", 6))
                  return false;
           ....
        }

which always returns false on s390. The caller build_cpu_topology()
checks has_die_topology() return value. On false the
the struct cpu_topology::die_cpu_list is not contructed and has zero
entries. This leads to the failing comparison: #num_dies >= #num_packages.
s390 of course has a positive number of packages.

Fix this by adding s390 architecture to support CPU die list.

Output after:
 # ./perf test -Fv 7
  7: Simple expression parser                                        :
  --- start ---
  division by zero
  syntax error
  ---- end ----
  Simple expression parser: Ok
 #

Fixes: fdf1e29b6118c18f ("perf expr: Add metric literals for topology.")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: https://lore.kernel.org/r/20211124090343.9436-1-tmricht@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cputopo.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c
index 84ca106a3246..e20b835a1194 100644
--- a/tools/perf/util/cputopo.c
+++ b/tools/perf/util/cputopo.c
@@ -165,7 +165,8 @@ static bool has_die_topology(void)
 	if (uname(&uts) < 0)
 		return false;
 
-	if (strncmp(uts.machine, "x86_64", 6))
+	if (strncmp(uts.machine, "x86_64", 6) &&
+	    strncmp(uts.machine, "s390x", 5))
 		return false;
 
 	scnprintf(filename, MAXPATHLEN, DIE_CPUS_FMT,
-- 
cgit v1.2.3


From 415a3c33e847349c0f76575b3ebfdfae2f5a681a Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Wed, 5 Jan 2022 04:35:30 -0800
Subject: kvm: selftests: Add support for KVM_CAP_XSAVE2

When KVM_CAP_XSAVE2 is supported, userspace is expected to allocate
buffer for KVM_GET_XSAVE2 and KVM_SET_XSAVE using the size returned
by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2).

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Guang Zeng <guang.zeng@intel.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20220105123532.12586-20-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/arch/x86/include/uapi/asm/kvm.h              | 16 +++++-
 tools/include/uapi/linux/kvm.h                     |  3 +
 .../testing/selftests/kvm/include/kvm_util_base.h  |  2 +
 .../selftests/kvm/include/x86_64/processor.h       | 10 ++++
 tools/testing/selftests/kvm/lib/kvm_util.c         | 32 +++++++++++
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 67 +++++++++++++++++++++-
 tools/testing/selftests/kvm/x86_64/evmcs_test.c    |  2 +-
 tools/testing/selftests/kvm/x86_64/smm_test.c      |  2 +-
 tools/testing/selftests/kvm/x86_64/state_test.c    |  2 +-
 .../kvm/x86_64/vmx_preemption_timer_test.c         |  2 +-
 10 files changed, 130 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 5a776a08f78c..2da3316bb559 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -373,9 +373,23 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
 struct kvm_xsave {
+	/*
+	 * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+	 * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * respectively, when invoked on the vm file descriptor.
+	 *
+	 * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+	 * will always be at least 4096. Currently, it is only greater
+	 * than 4096 if a dynamic feature has been enabled with
+	 * ``arch_prctl()``, but this may change in the future.
+	 *
+	 * The offsets of the state save areas in struct kvm_xsave follow
+	 * the contents of CPUID leaf 0xD on the host.
+	 */
 	__u32 region[1024];
+	__u32 extra[0];
 };
 
 #define KVM_MAX_XCRS	16
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 1daa45268de2..f066637ee206 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1131,6 +1131,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
+#define KVM_CAP_XSAVE2 207
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1551,6 +1552,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_XSAVE */
 #define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
 #define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 1e5ab6a92848..66775de26952 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -103,6 +103,7 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[];
 int open_path_or_exit(const char *path, int flags);
 int open_kvm_dev_path_or_exit(void);
 int kvm_check_cap(long cap);
+int vm_check_cap(struct kvm_vm *vm, long cap);
 int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
 		    struct kvm_enable_cap *cap);
@@ -344,6 +345,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
  *   guest_code - The vCPU's entry point
  */
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
+void vm_xsave_req_perm(void);
 
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 05e65ca1c30c..58633e51960f 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -10,8 +10,10 @@
 
 #include <assert.h>
 #include <stdint.h>
+#include <syscall.h>
 
 #include <asm/msr-index.h>
+#include <asm/prctl.h>
 
 #include "../kvm_util.h"
 
@@ -352,6 +354,7 @@ struct kvm_x86_state;
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
 		     struct kvm_x86_state *state);
+void kvm_x86_state_cleanup(struct kvm_x86_state *state);
 
 struct kvm_msr_list *kvm_get_msr_index_list(void);
 uint64_t kvm_get_feature_msr(uint64_t msr_index);
@@ -443,4 +446,11 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 /* VMX_EPT_VPID_CAP bits */
 #define VMX_EPT_VPID_CAP_AD_BITS       (1ULL << 21)
 
+#define XSTATE_XTILE_CFG_BIT		17
+#define XSTATE_XTILE_DATA_BIT		18
+
+#define XSTATE_XTILE_CFG_MASK		(1ULL << XSTATE_XTILE_CFG_BIT)
+#define XSTATE_XTILE_DATA_MASK		(1ULL << XSTATE_XTILE_DATA_BIT)
+#define XFEATURE_XTILE_MASK		(XSTATE_XTILE_CFG_MASK | \
+					XSTATE_XTILE_DATA_MASK)
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ecc53d108ad8..4a645dc77f34 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -85,6 +85,33 @@ int kvm_check_cap(long cap)
 	return ret;
 }
 
+/* VM Check Capability
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   On success, the Value corresponding to the capability (KVM_CAP_*)
+ *   specified by the value of cap.  On failure a TEST_ASSERT failure
+ *   is produced.
+ *
+ * Looks up and returns the value corresponding to the capability
+ * (KVM_CAP_*) given by cap.
+ */
+int vm_check_cap(struct kvm_vm *vm, long cap)
+{
+	int ret;
+
+	ret = ioctl(vm->fd, KVM_CHECK_EXTENSION, cap);
+	TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION VM IOCTL failed,\n"
+		"  rc: %i errno: %i", ret, errno);
+
+	return ret;
+}
+
 /* VM Enable Capability
  *
  * Input Args:
@@ -366,6 +393,11 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
 	struct kvm_vm *vm;
 	int i;
 
+	/*
+	 * Permission needs to be requested before KVM_SET_CPUID2.
+	 */
+	vm_xsave_req_perm();
+
 	/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
 	if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
 		slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index eef7b34756d5..f19d6d201977 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -650,6 +650,45 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid)
 	vcpu_sregs_set(vm, vcpuid, &sregs);
 }
 
+#define CPUID_XFD_BIT (1 << 4)
+static bool is_xfd_supported(void)
+{
+	int eax, ebx, ecx, edx;
+	const int leaf = 0xd, subleaf = 0x1;
+
+	__asm__ __volatile__(
+		"cpuid"
+		: /* output */ "=a"(eax), "=b"(ebx),
+		  "=c"(ecx), "=d"(edx)
+		: /* input */ "0"(leaf), "2"(subleaf));
+
+	return !!(eax & CPUID_XFD_BIT);
+}
+
+void vm_xsave_req_perm(void)
+{
+	unsigned long bitmask;
+	long rc;
+
+	if (!is_xfd_supported())
+		return;
+
+	rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
+		     XSTATE_XTILE_DATA_BIT);
+	/*
+	 * The older kernel version(<5.15) can't support
+	 * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
+	 */
+	if (rc)
+		return;
+
+	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
+	TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
+	TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK,
+		    "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
+		    bitmask);
+}
+
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 {
 	struct kvm_mp_state mp_state;
@@ -1018,10 +1057,10 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
 }
 
 struct kvm_x86_state {
+	struct kvm_xsave *xsave;
 	struct kvm_vcpu_events events;
 	struct kvm_mp_state mp_state;
 	struct kvm_regs regs;
-	struct kvm_xsave xsave;
 	struct kvm_xcrs xcrs;
 	struct kvm_sregs sregs;
 	struct kvm_debugregs debugregs;
@@ -1069,6 +1108,22 @@ struct kvm_msr_list *kvm_get_msr_index_list(void)
 	return list;
 }
 
+static int vcpu_save_xsave_state(struct kvm_vm *vm, struct vcpu *vcpu,
+				 struct kvm_x86_state *state)
+{
+	int size;
+
+	size = vm_check_cap(vm, KVM_CAP_XSAVE2);
+	if (!size)
+		size = sizeof(struct kvm_xsave);
+
+	state->xsave = malloc(size);
+	if (size == sizeof(struct kvm_xsave))
+		return ioctl(vcpu->fd, KVM_GET_XSAVE, state->xsave);
+	else
+		return ioctl(vcpu->fd, KVM_GET_XSAVE2, state->xsave);
+}
+
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
 {
 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
@@ -1112,7 +1167,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
                 r);
 
-	r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave);
+	r = vcpu_save_xsave_state(vm, vcpu, state);
         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
                 r);
 
@@ -1157,7 +1212,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 	int r;
 
-	r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave);
+	r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
                 r);
 
@@ -1198,6 +1253,12 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 	}
 }
 
+void kvm_x86_state_cleanup(struct kvm_x86_state *state)
+{
+	free(state->xsave);
+	free(state);
+}
+
 bool is_intel_cpu(void)
 {
 	int eax, ebx, ecx, edx;
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
index 2b46dcca86a8..4c7841dfd481 100644
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -129,7 +129,7 @@ static void save_restore_vm(struct kvm_vm *vm)
 	vcpu_set_hv_cpuid(vm, VCPU_ID);
 	vcpu_enable_evmcs(vm, VCPU_ID);
 	vcpu_load_state(vm, VCPU_ID, state);
-	free(state);
+	kvm_x86_state_cleanup(state);
 
 	memset(&regs2, 0, sizeof(regs2));
 	vcpu_regs_get(vm, VCPU_ID, &regs2);
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
index d0fe2fdce58c..2da8eb8e2d96 100644
--- a/tools/testing/selftests/kvm/x86_64/smm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -212,7 +212,7 @@ int main(int argc, char *argv[])
 		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 		vcpu_load_state(vm, VCPU_ID, state);
 		run = vcpu_state(vm, VCPU_ID);
-		free(state);
+		kvm_x86_state_cleanup(state);
 	}
 
 done:
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index 32854c1462ad..2e0a92da8ff5 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -218,7 +218,7 @@ int main(int argc, char *argv[])
 		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 		vcpu_load_state(vm, VCPU_ID, state);
 		run = vcpu_state(vm, VCPU_ID);
-		free(state);
+		kvm_x86_state_cleanup(state);
 
 		memset(&regs2, 0, sizeof(regs2));
 		vcpu_regs_get(vm, VCPU_ID, &regs2);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
index a07480aed397..ff92e25b6f1e 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -244,7 +244,7 @@ int main(int argc, char *argv[])
 		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 		vcpu_load_state(vm, VCPU_ID, state);
 		run = vcpu_state(vm, VCPU_ID);
-		free(state);
+		kvm_x86_state_cleanup(state);
 
 		memset(&regs2, 0, sizeof(regs2));
 		vcpu_regs_get(vm, VCPU_ID, &regs2);
-- 
cgit v1.2.3


From 551447cfa5dc208b7fba7aa98391d5cc8149fa5a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Dec 2021 09:53:20 -0500
Subject: selftest: kvm: Reorder vcpu_load_state steps for AMX

For AMX support it is recommended to load XCR0 after XFD, so
that KVM does not see XFD=0, XCR=1 for a save state that will
eventually be disabled (which would lead to premature allocation
of the space required for that save state).

It is also required to load XSAVE data after XCR0 and XFD, so
that KVM can trigger allocation of the extra space required to
store AMX state.

Adjust vcpu_load_state to obey these new requirements.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20211223145322.2914028-2-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index f19d6d201977..93264424aee5 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1212,24 +1212,25 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 	int r;
 
-	r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
+	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
                 r);
 
+	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
+	TEST_ASSERT(r == state->msrs.nmsrs,
+		"Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
+		r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
+
 	if (kvm_check_cap(KVM_CAP_XCRS)) {
 		r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
 		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
 			    r);
 	}
 
-	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
+	r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
                 r);
 
-	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
-        TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
-                r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
-
 	r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
                 r);
-- 
cgit v1.2.3


From 6559b4a523cd65f6005b4592833b16ba970abdf5 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Thu, 23 Dec 2021 09:53:21 -0500
Subject: selftest: kvm: Move struct kvm_x86_state to header

Those changes can avoid dereferencing pointer compile issue
when amx_test.c reference state->xsave.

Move struct kvm_x86_state definition to processor.h.

Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20211223145322.2914028-3-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h | 16 +++++++++++++++-
 tools/testing/selftests/kvm/lib/x86_64/processor.c     | 15 ---------------
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 58633e51960f..e94ba0fc67d8 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -94,6 +94,21 @@ struct desc_ptr {
 	uint64_t address;
 } __attribute__((packed));
 
+struct kvm_x86_state {
+	struct kvm_xsave *xsave;
+	struct kvm_vcpu_events events;
+	struct kvm_mp_state mp_state;
+	struct kvm_regs regs;
+	struct kvm_xcrs xcrs;
+	struct kvm_sregs sregs;
+	struct kvm_debugregs debugregs;
+	union {
+		struct kvm_nested_state nested;
+		char nested_[16384];
+	};
+	struct kvm_msrs msrs;
+};
+
 static inline uint64_t get_desc64_base(const struct desc64 *desc)
 {
 	return ((uint64_t)desc->base3 << 32) |
@@ -350,7 +365,6 @@ static inline unsigned long get_xmm(int n)
 
 bool is_intel_cpu(void);
 
-struct kvm_x86_state;
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
 		     struct kvm_x86_state *state);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 93264424aee5..babb0f28575c 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1056,21 +1056,6 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
 	sregs_dump(stream, &sregs, indent + 4);
 }
 
-struct kvm_x86_state {
-	struct kvm_xsave *xsave;
-	struct kvm_vcpu_events events;
-	struct kvm_mp_state mp_state;
-	struct kvm_regs regs;
-	struct kvm_xcrs xcrs;
-	struct kvm_sregs sregs;
-	struct kvm_debugregs debugregs;
-	union {
-		struct kvm_nested_state nested;
-		char nested_[16384];
-	};
-	struct kvm_msrs msrs;
-};
-
 static int kvm_get_num_msrs_fd(int kvm_fd)
 {
 	struct kvm_msr_list nmsrs;
-- 
cgit v1.2.3


From bf70636d9443c9e0718fd98765ba634e631ed079 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Thu, 23 Dec 2021 09:53:22 -0500
Subject: selftest: kvm: Add amx selftest

This selftest covers two aspects of AMX.  The first is triggering #NM
exception and checking the MSR XFD_ERR value.  The second case is
loading tile config and tile data into guest registers and trapping to
the host side for a complete save/load of the guest state.  TMM0
is also checked against memory data after save/restore.

Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Message-Id: <20211223145322.2914028-4-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 tools/testing/selftests/kvm/x86_64/amx_test.c | 448 ++++++++++++++++++++++++++
 2 files changed, 449 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/amx_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c407ebbec2c1..ee8cf2149824 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -82,6 +82,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
 TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
+TEST_GEN_PROGS_x86_64 += x86_64/amx_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c
new file mode 100644
index 000000000000..523c1e99ed64
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * amx tests
+ *
+ * Copyright (C) 2021, Intel, Inc.
+ *
+ * Tests for amx #NM exception and save/restore.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+#define VCPU_ID				0
+#define X86_FEATURE_XSAVE		(1 << 26)
+#define X86_FEATURE_OSXSAVE		(1 << 27)
+
+#define PAGE_SIZE			(1 << 12)
+#define NUM_TILES			8
+#define TILE_SIZE			1024
+#define XSAVE_SIZE			((NUM_TILES * TILE_SIZE) + PAGE_SIZE)
+
+/* Tile configuration associated: */
+#define MAX_TILES			16
+#define RESERVED_BYTES			14
+
+#define XFEATURE_XTILECFG		17
+#define XFEATURE_XTILEDATA		18
+#define XFEATURE_MASK_XTILECFG		(1 << XFEATURE_XTILECFG)
+#define XFEATURE_MASK_XTILEDATA		(1 << XFEATURE_XTILEDATA)
+#define XFEATURE_MASK_XTILE		(XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
+
+#define TILE_CPUID			0x1d
+#define XSTATE_CPUID			0xd
+#define TILE_PALETTE_CPUID_SUBLEAVE	0x1
+#define XSTATE_USER_STATE_SUBLEAVE	0x0
+
+#define XSAVE_HDR_OFFSET		512
+
+struct xsave_data {
+	u8 area[XSAVE_SIZE];
+} __aligned(64);
+
+struct tile_config {
+	u8  palette_id;
+	u8  start_row;
+	u8  reserved[RESERVED_BYTES];
+	u16 colsb[MAX_TILES];
+	u8  rows[MAX_TILES];
+};
+
+struct tile_data {
+	u8 data[NUM_TILES * TILE_SIZE];
+};
+
+struct xtile_info {
+	u16 bytes_per_tile;
+	u16 bytes_per_row;
+	u16 max_names;
+	u16 max_rows;
+	u32 xsave_offset;
+	u32 xsave_size;
+};
+
+static struct xtile_info xtile;
+
+static inline u64 __xgetbv(u32 index)
+{
+	u32 eax, edx;
+
+	asm volatile("xgetbv;"
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((u64)edx << 32);
+}
+
+static inline void __xsetbv(u32 index, u64 value)
+{
+	u32 eax = value;
+	u32 edx = value >> 32;
+
+	asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
+}
+
+static inline void __ldtilecfg(void *cfg)
+{
+	asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00"
+		     : : "a"(cfg));
+}
+
+static inline void __tileloadd(void *tile)
+{
+	asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10"
+		     : : "a"(tile), "d"(0));
+}
+
+static inline void __tilerelease(void)
+{
+	asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
+}
+
+static inline void __xsavec(struct xsave_data *data, uint64_t rfbm)
+{
+	uint32_t rfbm_lo = rfbm;
+	uint32_t rfbm_hi = rfbm >> 32;
+
+	asm volatile("xsavec (%%rdi)"
+		     : : "D" (data), "a" (rfbm_lo), "d" (rfbm_hi)
+		     : "memory");
+}
+
+static inline void check_cpuid_xsave(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	eax = 1;
+	ecx = 0;
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (!(ecx & X86_FEATURE_XSAVE))
+		GUEST_ASSERT(!"cpuid: no CPU xsave support!");
+	if (!(ecx & X86_FEATURE_OSXSAVE))
+		GUEST_ASSERT(!"cpuid: no OS xsave support!");
+}
+
+static bool check_xsave_supports_xtile(void)
+{
+	return __xgetbv(0) & XFEATURE_MASK_XTILE;
+}
+
+static bool enum_xtile_config(void)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = TILE_CPUID;
+	ecx = TILE_PALETTE_CPUID_SUBLEAVE;
+
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (!eax || !ebx || !ecx)
+		return false;
+
+	xtile.max_names = ebx >> 16;
+	if (xtile.max_names < NUM_TILES)
+		return false;
+
+	xtile.bytes_per_tile = eax >> 16;
+	if (xtile.bytes_per_tile < TILE_SIZE)
+		return false;
+
+	xtile.bytes_per_row = ebx;
+	xtile.max_rows = ecx;
+
+	return true;
+}
+
+static bool enum_xsave_tile(void)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = XSTATE_CPUID;
+	ecx = XFEATURE_XTILEDATA;
+
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (!eax || !ebx)
+		return false;
+
+	xtile.xsave_offset = ebx;
+	xtile.xsave_size = eax;
+
+	return true;
+}
+
+static bool check_xsave_size(void)
+{
+	u32 eax, ebx, ecx, edx;
+	bool valid = false;
+
+	eax = XSTATE_CPUID;
+	ecx = XSTATE_USER_STATE_SUBLEAVE;
+
+	cpuid(&eax, &ebx, &ecx, &edx);
+	if (ebx && ebx <= XSAVE_SIZE)
+		valid = true;
+
+	return valid;
+}
+
+static bool check_xtile_info(void)
+{
+	bool ret = false;
+
+	if (!check_xsave_size())
+		return ret;
+
+	if (!enum_xsave_tile())
+		return ret;
+
+	if (!enum_xtile_config())
+		return ret;
+
+	if (sizeof(struct tile_data) >= xtile.xsave_size)
+		ret = true;
+
+	return ret;
+}
+
+static void set_tilecfg(struct tile_config *cfg)
+{
+	int i;
+
+	/* Only palette id 1 */
+	cfg->palette_id = 1;
+	for (i = 0; i < xtile.max_names; i++) {
+		cfg->colsb[i] = xtile.bytes_per_row;
+		cfg->rows[i] = xtile.max_rows;
+	}
+}
+
+static void set_xstatebv(void *data, uint64_t bv)
+{
+	*(uint64_t *)(data + XSAVE_HDR_OFFSET) = bv;
+}
+
+static u64 get_xstatebv(void *data)
+{
+	return *(u64 *)(data + XSAVE_HDR_OFFSET);
+}
+
+static void init_regs(void)
+{
+	uint64_t cr4, xcr0;
+
+	/* turn on CR4.OSXSAVE */
+	cr4 = get_cr4();
+	cr4 |= X86_CR4_OSXSAVE;
+	set_cr4(cr4);
+
+	xcr0 = __xgetbv(0);
+	xcr0 |= XFEATURE_MASK_XTILE;
+	__xsetbv(0x0, xcr0);
+}
+
+static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
+						    struct tile_data *tiledata,
+						    struct xsave_data *xsave_data)
+{
+	init_regs();
+	check_cpuid_xsave();
+	GUEST_ASSERT(check_xsave_supports_xtile());
+	GUEST_ASSERT(check_xtile_info());
+
+	/* check xtile configs */
+	GUEST_ASSERT(xtile.xsave_offset == 2816);
+	GUEST_ASSERT(xtile.xsave_size == 8192);
+	GUEST_ASSERT(xtile.max_names == 8);
+	GUEST_ASSERT(xtile.bytes_per_tile == 1024);
+	GUEST_ASSERT(xtile.bytes_per_row == 64);
+	GUEST_ASSERT(xtile.max_rows == 16);
+	GUEST_SYNC(1);
+
+	/* xfd=0, enable amx */
+	wrmsr(MSR_IA32_XFD, 0);
+	GUEST_SYNC(2);
+	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
+	set_tilecfg(amx_cfg);
+	__ldtilecfg(amx_cfg);
+	GUEST_SYNC(3);
+	/* Check save/restore when trap to userspace */
+	__tileloadd(tiledata);
+	GUEST_SYNC(4);
+	__tilerelease();
+	GUEST_SYNC(5);
+	/* bit 18 not in the XCOMP_BV after xsavec() */
+	set_xstatebv(xsave_data, XFEATURE_MASK_XTILEDATA);
+	__xsavec(xsave_data, XFEATURE_MASK_XTILEDATA);
+	GUEST_ASSERT((get_xstatebv(xsave_data) & XFEATURE_MASK_XTILEDATA) == 0);
+
+	/* xfd=0x40000, disable amx tiledata */
+	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILEDATA);
+	GUEST_SYNC(6);
+	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILEDATA);
+	set_tilecfg(amx_cfg);
+	__ldtilecfg(amx_cfg);
+	/* Trigger #NM exception */
+	__tileloadd(tiledata);
+	GUEST_SYNC(10);
+
+	GUEST_DONE();
+}
+
+void guest_nm_handler(struct ex_regs *regs)
+{
+	/* Check if #NM is triggered by XFEATURE_MASK_XTILEDATA */
+	GUEST_SYNC(7);
+	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA);
+	GUEST_SYNC(8);
+	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILEDATA);
+	/* Clear xfd_err */
+	wrmsr(MSR_IA32_XFD_ERR, 0);
+	/* xfd=0, enable amx */
+	wrmsr(MSR_IA32_XFD, 0);
+	GUEST_SYNC(9);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_cpuid_entry2 *entry;
+	struct kvm_regs regs1, regs2;
+	bool amx_supported = false;
+	struct kvm_vm *vm;
+	struct kvm_run *run;
+	struct kvm_x86_state *state;
+	int xsave_restore_size = 0;
+	vm_vaddr_t amx_cfg, tiledata, xsavedata;
+	struct ucall uc;
+	u32 amx_offset;
+	int stage, ret;
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+	entry = kvm_get_supported_cpuid_entry(1);
+	if (!(entry->ecx & X86_FEATURE_XSAVE)) {
+		print_skip("XSAVE feature not supported");
+		exit(KSFT_SKIP);
+	}
+
+	if (kvm_get_cpuid_max_basic() >= 0xd) {
+		entry = kvm_get_supported_cpuid_index(0xd, 0);
+		amx_supported = entry && !!(entry->eax & XFEATURE_MASK_XTILE);
+		if (!amx_supported) {
+			print_skip("AMX is not supported by the vCPU (eax=0x%x)", entry->eax);
+			exit(KSFT_SKIP);
+		}
+		/* Get xsave/restore max size */
+		xsave_restore_size = entry->ecx;
+	}
+
+	run = vcpu_state(vm, VCPU_ID);
+	vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+	/* Register #NM handler */
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vm, VCPU_ID);
+	vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
+
+	/* amx cfg for guest_code */
+	amx_cfg = vm_vaddr_alloc_page(vm);
+	memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
+
+	/* amx tiledata for guest_code */
+	tiledata = vm_vaddr_alloc_pages(vm, 2);
+	memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize());
+
+	/* xsave data for guest_code */
+	xsavedata = vm_vaddr_alloc_pages(vm, 3);
+	memset(addr_gva2hva(vm, xsavedata), 0, 3 * getpagesize());
+	vcpu_args_set(vm, VCPU_ID, 3, amx_cfg, tiledata, xsavedata);
+
+	for (stage = 1; ; stage++) {
+		_vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Stage %d: unexpected exit reason: %u (%s),\n",
+			    stage, run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_ABORT:
+			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+				  __FILE__, uc.args[1]);
+			/* NOT REACHED */
+		case UCALL_SYNC:
+			switch (uc.args[1]) {
+			case 1:
+			case 2:
+			case 3:
+			case 5:
+			case 6:
+			case 7:
+			case 8:
+				fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
+				break;
+			case 4:
+			case 10:
+				fprintf(stderr,
+				"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+
+				/* Compacted mode, get amx offset by xsave area
+				 * size subtract 8K amx size.
+				 */
+				amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+				state = vcpu_save_state(vm, VCPU_ID);
+				void *amx_start = (void *)state->xsave + amx_offset;
+				void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
+				/* Only check TMM0 register, 1 tile */
+				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
+				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d\n", ret);
+				kvm_x86_state_cleanup(state);
+				break;
+			case 9:
+				fprintf(stderr,
+				"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
+				break;
+			}
+			break;
+		case UCALL_DONE:
+			fprintf(stderr, "UCALL_DONE\n");
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+
+		state = vcpu_save_state(vm, VCPU_ID);
+		memset(&regs1, 0, sizeof(regs1));
+		vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+		kvm_vm_release(vm);
+
+		/* Restore state in a new VM.  */
+		kvm_vm_restart(vm, O_RDWR);
+		vm_vcpu_add(vm, VCPU_ID);
+		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+		vcpu_load_state(vm, VCPU_ID, state);
+		run = vcpu_state(vm, VCPU_ID);
+		kvm_x86_state_cleanup(state);
+
+		memset(&regs2, 0, sizeof(regs2));
+		vcpu_regs_get(vm, VCPU_ID, &regs2);
+		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+			    (ulong) regs2.rdi, (ulong) regs2.rsi);
+	}
+done:
+	kvm_vm_free(vm);
+}
-- 
cgit v1.2.3


From 88a71086c48ae98e93c0208044827621e9717f7e Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 6 Jan 2022 17:15:08 +0530
Subject: tools/bpf: Rename 'struct event' to avoid naming conflict

On ppc64le, trying to build bpf seltests throws the below warning:
  In file included from runqslower.bpf.c:5:
  ./runqslower.h:7:8: error: redefinition of 'event'
  struct event {
	 ^
  /home/naveen/linux/tools/testing/selftests/bpf/tools/build/runqslower/vmlinux.h:156602:8:
  note: previous definition is here
  struct event {
	 ^

This happens since 'struct event' is defined in
drivers/net/ethernet/alteon/acenic.h . Rename the one in runqslower to a
more appropriate 'runq_event' to avoid the naming conflict.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/c13cb3767d26257ca4387b8296b632b433a58db6.1641468127.git.naveen.n.rao@linux.vnet.ibm.com
---
 tools/bpf/runqslower/runqslower.bpf.c | 2 +-
 tools/bpf/runqslower/runqslower.c     | 2 +-
 tools/bpf/runqslower/runqslower.h     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
index ab9353f2fd46..9a5c1f008fe6 100644
--- a/tools/bpf/runqslower/runqslower.bpf.c
+++ b/tools/bpf/runqslower/runqslower.bpf.c
@@ -68,7 +68,7 @@ int handle__sched_switch(u64 *ctx)
 	 */
 	struct task_struct *prev = (struct task_struct *)ctx[1];
 	struct task_struct *next = (struct task_struct *)ctx[2];
-	struct event event = {};
+	struct runq_event event = {};
 	u64 *tsp, delta_us;
 	long state;
 	u32 pid;
diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c
index 2414cc764461..d78f4148597f 100644
--- a/tools/bpf/runqslower/runqslower.c
+++ b/tools/bpf/runqslower/runqslower.c
@@ -100,7 +100,7 @@ static int bump_memlock_rlimit(void)
 
 void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
 {
-	const struct event *e = data;
+	const struct runq_event *e = data;
 	struct tm *tm;
 	char ts[32];
 	time_t t;
diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h
index 9db225425e5f..4f70f07200c2 100644
--- a/tools/bpf/runqslower/runqslower.h
+++ b/tools/bpf/runqslower/runqslower.h
@@ -4,7 +4,7 @@
 
 #define TASK_COMM_LEN 16
 
-struct event {
+struct runq_event {
 	char task[TASK_COMM_LEN];
 	__u64 delta_us;
 	pid_t pid;
-- 
cgit v1.2.3


From 2c769ed7137a75a8297936fb54e6ff1f56d3d0f1 Mon Sep 17 00:00:00 2001
From: chiminghao <chi.minghao@zte.com.cn>
Date: Fri, 14 Jan 2022 14:05:48 -0800
Subject: tools/testing/selftests/vm/userfaultfd.c: use swap() to make code
 cleaner

Fix the following coccicheck REVIEW:

 tools/testing/selftests/vm/userfaultfd.c:1531:21-22:use swap() to make code cleaner

Link: https://lkml.kernel.org/r/20211124031632.35317-1-chi.minghao@zte.com.cn
Signed-off-by: chiminghao <chi.minghao@zte.com.cn>
Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 9354a5e0321c..990f7aecf4a3 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -1417,7 +1417,6 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize)
 static int userfaultfd_stress(void)
 {
 	void *area;
-	char *tmp_area;
 	unsigned long nr;
 	struct uffdio_register uffdio_register;
 	struct uffd_stats uffd_stats[nr_cpus];
@@ -1528,13 +1527,9 @@ static int userfaultfd_stress(void)
 					    count_verify[nr], nr);
 
 		/* prepare next bounce */
-		tmp_area = area_src;
-		area_src = area_dst;
-		area_dst = tmp_area;
+		swap(area_src, area_dst);
 
-		tmp_area = area_src_alias;
-		area_src_alias = area_dst_alias;
-		area_dst_alias = tmp_area;
+		swap(area_src_alias, area_dst_alias);
 
 		uffd_stats_report(uffd_stats, nr_cpus);
 	}
-- 
cgit v1.2.3


From f77a286de48c04f7dd248b41584645afc3613bb1 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Fri, 14 Jan 2022 14:07:52 -0800
Subject: mm, hugepages: make memory size variable in hugepage-mremap selftest

The hugetlb vma mremap() test currently maps 1GB of memory to trigger
pmd sharing and make sure that 'unshare' path in mremap code works.  The
test originally only mapped 10MB of memory (as specified by the header
comment) but was later modified to 1GB to tackle this case.

However, not all machines will have 1GB of memory to spare for this
test.  Adding a mapping size arg will allow run_vmtest.sh to pass an
adequate mapping size, while allowing users to run the test
independently with arbitrary size mappings.

Link: https://lkml.kernel.org/r/20211124203805.3700355-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/hugepage-mremap.c | 46 ++++++++++++++++++----------
 tools/testing/selftests/vm/run_vmtests.sh    |  2 +-
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
index 257df94697a5..2a7c33631a29 100644
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ b/tools/testing/selftests/vm/hugepage-mremap.c
@@ -4,7 +4,11 @@
  *
  * Example of remapping huge page memory in a user application using the
  * mremap system call.  Code assumes a hugetlbfs filesystem is mounted
- * at './huge'.  The code will use 10MB worth of huge pages.
+ * at './huge'.  The amount of memory used by this test is decided by a command
+ * line argument in MBs. If missing, the default amount is 10MB.
+ *
+ * To make sure the test triggers pmd sharing and goes through the 'unshare'
+ * path in the mremap code use 1GB (1024) or more.
  */
 
 #define _GNU_SOURCE
@@ -18,8 +22,10 @@
 #include <linux/userfaultfd.h>
 #include <sys/ioctl.h>
 
-#define LENGTH (1UL * 1024 * 1024 * 1024)
+#define DEFAULT_LENGTH_MB 10UL
+#define MB_TO_BYTES(x) (x * 1024 * 1024)
 
+#define FILE_NAME "huge/hugepagefile"
 #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
 #define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
 
@@ -28,20 +34,20 @@ static void check_bytes(char *addr)
 	printf("First hex is %x\n", *((unsigned int *)addr));
 }
 
-static void write_bytes(char *addr)
+static void write_bytes(char *addr, size_t len)
 {
 	unsigned long i;
 
-	for (i = 0; i < LENGTH; i++)
+	for (i = 0; i < len; i++)
 		*(addr + i) = (char)i;
 }
 
-static int read_bytes(char *addr)
+static int read_bytes(char *addr, size_t len)
 {
 	unsigned long i;
 
 	check_bytes(addr);
-	for (i = 0; i < LENGTH; i++)
+	for (i = 0; i < len; i++)
 		if (*(addr + i) != (char)i) {
 			printf("Mismatch at %lu\n", i);
 			return 1;
@@ -99,11 +105,19 @@ static void register_region_with_uffd(char *addr, size_t len)
 	}
 }
 
-int main(void)
+int main(int argc, char *argv[])
 {
+	/* Read memory length as the first arg if valid, otherwise fallback to
+	 * the default length. Any additional args are ignored.
+	 */
+	size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL;
+
+	length = length > 0 ? length : DEFAULT_LENGTH_MB;
+	length = MB_TO_BYTES(length);
+
 	int ret = 0;
 
-	int fd = open("/huge/test", O_CREAT | O_RDWR, 0755);
+	int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
 
 	if (fd < 0) {
 		perror("Open failed");
@@ -112,7 +126,7 @@ int main(void)
 
 	/* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
 	unsigned long suggested_addr = 0x7eaa40000000;
-	void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION,
+	void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
 			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
 	printf("Map haddr: Returned address is %p\n", haddr);
 	if (haddr == MAP_FAILED) {
@@ -122,7 +136,7 @@ int main(void)
 
 	/* mmap again to a dummy address to hopefully trigger pmd sharing. */
 	suggested_addr = 0x7daa40000000;
-	void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION,
+	void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
 			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
 	printf("Map daddr: Returned address is %p\n", daddr);
 	if (daddr == MAP_FAILED) {
@@ -132,16 +146,16 @@ int main(void)
 
 	suggested_addr = 0x7faa40000000;
 	void *vaddr =
-		mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0);
+		mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
 	printf("Map vaddr: Returned address is %p\n", vaddr);
 	if (vaddr == MAP_FAILED) {
 		perror("mmap2");
 		exit(1);
 	}
 
-	register_region_with_uffd(haddr, LENGTH);
+	register_region_with_uffd(haddr, length);
 
-	void *addr = mremap(haddr, LENGTH, LENGTH,
+	void *addr = mremap(haddr, length, length,
 			    MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
 	if (addr == MAP_FAILED) {
 		perror("mremap");
@@ -150,10 +164,10 @@ int main(void)
 
 	printf("Mremap: Returned address is %p\n", addr);
 	check_bytes(addr);
-	write_bytes(addr);
-	ret = read_bytes(addr);
+	write_bytes(addr, length);
+	ret = read_bytes(addr, length);
 
-	munmap(addr, LENGTH);
+	munmap(addr, length);
 
 	return ret;
 }
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index a24d30af3094..75d401741394 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -111,7 +111,7 @@ fi
 echo "-----------------------"
 echo "running hugepage-mremap"
 echo "-----------------------"
-./hugepage-mremap
+./hugepage-mremap 256
 if [ $? -ne 0 ]; then
 	echo "[FAIL]"
 	exitcode=1
-- 
cgit v1.2.3


From 209376ed2a8431ccb4c40fdcef11194fc1e749b0 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Fri, 14 Jan 2022 14:07:58 -0800
Subject: selftests/vm: make charge_reserved_hugetlb.sh work with existing
 cgroup setting

The hugetlb cgroup reservation test charge_reserved_hugetlb.sh assume
that no cgroup filesystems are mounted before running the test.  That is
not true in many cases.  As a result, the test fails to run.  Fix that
by querying the current cgroup mount setting and using the existing
cgroup setup instead before attempting to freshly mount a cgroup
filesystem.

Similar change is also made for hugetlb_reparenting_test.sh as well,
though it still has problem if cgroup v2 isn't used.

The patched test scripts were run on a centos 8 based system to verify
that they ran properly.

Link: https://lkml.kernel.org/r/20220106201359.1646575-1-longman@redhat.com
Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests")
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Mina Almasry <almasrymina@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../selftests/vm/charge_reserved_hugetlb.sh        | 34 +++++++++++++---------
 .../selftests/vm/hugetlb_reparenting_test.sh       | 21 ++++++++-----
 tools/testing/selftests/vm/write_hugetlb_memory.sh |  2 +-
 3 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
index fe8fcfb334e0..a5cb4b09a46c 100644
--- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
@@ -24,19 +24,23 @@ if [[ "$1" == "-cgroup-v2" ]]; then
   reservation_usage_file=rsvd.current
 fi
 
-cgroup_path=/dev/cgroup/memory
-if [[ ! -e $cgroup_path ]]; then
-  mkdir -p $cgroup_path
-  if [[ $cgroup2 ]]; then
+if [[ $cgroup2 ]]; then
+  cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
     mount -t cgroup2 none $cgroup_path
-  else
+    do_umount=1
+  fi
+  echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
+else
+  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
     mount -t cgroup memory,hugetlb $cgroup_path
+    do_umount=1
   fi
 fi
-
-if [[ $cgroup2 ]]; then
-  echo "+hugetlb" >/dev/cgroup/memory/cgroup.subtree_control
-fi
+export cgroup_path
 
 function cleanup() {
   if [[ $cgroup2 ]]; then
@@ -108,7 +112,7 @@ function setup_cgroup() {
 
 function wait_for_hugetlb_memory_to_get_depleted() {
   local cgroup="$1"
-  local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
   # Wait for hugetlbfs memory to get depleted.
   while [ $(cat $path) != 0 ]; do
     echo Waiting for hugetlb memory to get depleted.
@@ -121,7 +125,7 @@ function wait_for_hugetlb_memory_to_get_reserved() {
   local cgroup="$1"
   local size="$2"
 
-  local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
   # Wait for hugetlbfs memory to get written.
   while [ $(cat $path) != $size ]; do
     echo Waiting for hugetlb memory reservation to reach size $size.
@@ -134,7 +138,7 @@ function wait_for_hugetlb_memory_to_get_written() {
   local cgroup="$1"
   local size="$2"
 
-  local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
   # Wait for hugetlbfs memory to get written.
   while [ $(cat $path) != $size ]; do
     echo Waiting for hugetlb memory to reach size $size.
@@ -574,5 +578,7 @@ for populate in "" "-o"; do
   done     # populate
 done       # method
 
-umount $cgroup_path
-rmdir $cgroup_path
+if [[ $do_umount ]]; then
+  umount $cgroup_path
+  rmdir $cgroup_path
+fi
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
index 4a9a3afe9fd4..bf2d2a684edf 100644
--- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
@@ -18,19 +18,24 @@ if [[ "$1" == "-cgroup-v2" ]]; then
   usage_file=current
 fi
 
-CGROUP_ROOT='/dev/cgroup/memory'
-MNT='/mnt/huge/'
 
-if [[ ! -e $CGROUP_ROOT ]]; then
-  mkdir -p $CGROUP_ROOT
-  if [[ $cgroup2 ]]; then
+if [[ $cgroup2 ]]; then
+  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
     mount -t cgroup2 none $CGROUP_ROOT
-    sleep 1
-    echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
-  else
+    do_umount=1
+  fi
+  echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
+else
+  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
     mount -t cgroup memory,hugetlb $CGROUP_ROOT
+    do_umount=1
   fi
 fi
+MNT='/mnt/huge/'
 
 function get_machine_hugepage_size() {
   hpz=$(grep -i hugepagesize /proc/meminfo)
diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh
index d3d0d108924d..70a02301f4c2 100644
--- a/tools/testing/selftests/vm/write_hugetlb_memory.sh
+++ b/tools/testing/selftests/vm/write_hugetlb_memory.sh
@@ -14,7 +14,7 @@ want_sleep=$8
 reserve=$9
 
 echo "Putting task in cgroup '$cgroup'"
-echo $$ > /dev/cgroup/memory/"$cgroup"/cgroup.procs
+echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
 
 echo "Method is $method"
 
-- 
cgit v1.2.3


From fab51505480058dcb63d973515c748fa6c437cab Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 14 Jan 2022 14:08:01 -0800
Subject: selftests/uffd: allow EINTR/EAGAIN

This allow test to continue with interruptions like gdb.

Link: https://lkml.kernel.org/r/20211115135219.85881-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 990f7aecf4a3..c9eddd25ba17 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -648,7 +648,7 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg)
 
 	if (ret != sizeof(*msg)) {
 		if (ret < 0) {
-			if (errno == EAGAIN)
+			if (errno == EAGAIN || errno == EINTR)
 				return 1;
 			err("blocking read error");
 		} else {
@@ -724,8 +724,11 @@ static void *uffd_poll_thread(void *arg)
 
 	for (;;) {
 		ret = poll(pollfd, 2, -1);
-		if (ret <= 0)
+		if (ret <= 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
 			err("poll error: %d", ret);
+		}
 		if (pollfd[1].revents & POLLIN) {
 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
 				err("read pipefd error");
-- 
cgit v1.2.3


From 692b55815cf970eb4ce428f48f2c94d1800acc4b Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 14 Jan 2022 14:08:04 -0800
Subject: userfaultfd/selftests: clean up hugetlb allocation code

The message for commit f5c73297181c ("userfaultfd/selftests: fix hugetlb
area allocations") says there is no need to create a hugetlb file in the
non-shared testing case.  However, the commit did not actually change
the code to prevent creation of the file.

While it is technically true that there is no need to create and use a
hugetlb file in the case of non-shared-testing, it is useful.  This is
because 'hole punching' of a hugetlb file has the potentially incorrect
side effect of also removing pages from private mappings.  The
userfaultfd test relies on this side effect for removing pages from the
destination buffer during rounds of stress testing.

Remove the incomplete code that was added to deal with no hugetlb file.
Just keep the code that prevents reserves from being created for the
destination area.

Link: https://lkml.kernel.org/r/20220104021729.111006-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/userfaultfd.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index c9eddd25ba17..d3fd24f9fae8 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -87,7 +87,7 @@ static bool test_uffdio_minor = false;
 
 static bool map_shared;
 static int shm_fd;
-static int huge_fd = -1;	/* only used for hugetlb_shared test */
+static int huge_fd;
 static char *huge_fd_off0;
 static unsigned long long *count_verify;
 static int uffd = -1;
@@ -223,9 +223,6 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 
 static void hugetlb_release_pages(char *rel_area)
 {
-	if (huge_fd == -1)
-		return;
-
 	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 		      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
 		      nr_pages * page_size))
@@ -238,17 +235,17 @@ static void hugetlb_allocate_area(void **alloc_area)
 	char **alloc_area_alias;
 
 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-			   map_shared ? MAP_SHARED :
-			   MAP_PRIVATE | MAP_HUGETLB |
+			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+			   MAP_HUGETLB |
 			   (*alloc_area == area_src ? 0 : MAP_NORESERVE),
-			   huge_fd,
-			   *alloc_area == area_src ? 0 : nr_pages * page_size);
+			   huge_fd, *alloc_area == area_src ? 0 :
+			   nr_pages * page_size);
 	if (*alloc_area == MAP_FAILED)
 		err("mmap of hugetlbfs file failed");
 
 	if (map_shared) {
 		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-				  MAP_SHARED,
+				  MAP_SHARED | MAP_HUGETLB,
 				  huge_fd, *alloc_area == area_src ? 0 :
 				  nr_pages * page_size);
 		if (area_alias == MAP_FAILED)
-- 
cgit v1.2.3


From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Fri, 14 Jan 2022 14:09:31 -0800
Subject: mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault

hmm_range_fault() can be used instead of get_user_pages() for devices
which allow faulting however unlike get_user_pages() it will return an
error when used on a VM_MIXEDMAP range.

To make hmm_range_fault() more closely match get_user_pages() remove
this restriction.  This requires dealing with the !ARCH_HAS_PTE_SPECIAL
case in hmm_vma_handle_pte().  Rather than replicating the logic of
vm_normal_page() call it directly and do a check for the zero pfn
similar to what get_user_pages() currently does.

Also add a test to hmm selftest to verify functionality.

Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_hmm.c                         | 24 +++++++++++++++++++
 mm/hmm.c                               |  5 ++--
 tools/testing/selftests/vm/hmm-tests.c | 42 ++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e2ce8f9b7605..767538089a62 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
 	return 0;
 }
 
+static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long addr;
+
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+		struct page *page;
+		int ret;
+
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			return -ENOMEM;
+
+		ret = vm_insert_page(vma, addr, page);
+		if (ret) {
+			__free_page(page);
+			return ret;
+		}
+		put_page(page);
+	}
+
+	return 0;
+}
+
 static const struct file_operations dmirror_fops = {
 	.open		= dmirror_fops_open,
 	.release	= dmirror_fops_release,
+	.mmap		= dmirror_fops_mmap,
 	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
 	.llseek		= default_llseek,
 	.owner		= THIS_MODULE,
diff --git a/mm/hmm.c b/mm/hmm.c
index 842e26599238..bd56641c79d4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	 * Since each architecture defines a struct page for the zero page, just
 	 * fall through and treat it like a normal page.
 	 */
-	if (pte_special(pte) && !pte_devmap(pte) &&
+	if (!vm_normal_page(walk->vma, addr, pte) &&
+	    !pte_devmap(pte) &&
 	    !is_zero_pfn(pte_pfn(pte))) {
 		if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
 			pte_unmap(ptep);
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
 	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
 
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
 	    vma->vm_flags & VM_READ)
 		return 0;
 
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 864f126ffd78..203323967b50 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown)
 	}
 }
 
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned char *m;
+	int ret;
+
+	npages = 1;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE,
+			   self->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+	hmm_buffer_free(buffer);
+}
+
 /*
  * Test memory snapshot without faulting in pages accessed by the device.
  */
-- 
cgit v1.2.3


From 4ade0818cf048bb166e875ed4f8b456e6c2c7b3c Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 14 Aug 2021 14:17:04 -0700
Subject: tools: sync tools/bitmap with mother linux

Remove tools/include/asm-generic/bitops/find.h and copy
include/linux/bitmap.h to tools. find_*_le() functions are not copied
because not needed in tools.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 MAINTAINERS                             |   2 +-
 tools/include/asm-generic/bitops.h      |   1 -
 tools/include/asm-generic/bitops/find.h | 145 ----------------------
 tools/include/linux/bitmap.h            |   7 +-
 tools/include/linux/find.h              | 214 ++++++++++++++++++++++++++++++++
 tools/lib/find_bit.c                    |  20 +++
 6 files changed, 239 insertions(+), 150 deletions(-)
 delete mode 100644 tools/include/asm-generic/bitops/find.h
 create mode 100644 tools/include/linux/find.h

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index 4646786f5302..092b1c6fd2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3365,8 +3365,8 @@ F:	lib/bitmap.c
 F:	lib/find_bit.c
 F:	lib/find_bit_benchmark.c
 F:	lib/test_bitmap.c
-F:	tools/include/asm-generic/bitops/find.h
 F:	tools/include/linux/bitmap.h
+F:	tools/include/linux/find.h
 F:	tools/lib/bitmap.c
 F:	tools/lib/find_bit.c
 
diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h
index 5d2ab38965cc..9ab313e93555 100644
--- a/tools/include/asm-generic/bitops.h
+++ b/tools/include/asm-generic/bitops.h
@@ -18,7 +18,6 @@
 #include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/find.h>
 
 #ifndef _TOOLS_LINUX_BITOPS_H_
 #error only <linux/bitops.h> can be included directly
diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h
deleted file mode 100644
index 6481fd11012a..000000000000
--- a/tools/include/asm-generic/bitops/find.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
-#define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
-
-extern unsigned long _find_next_bit(const unsigned long *addr1,
-		const unsigned long *addr2, unsigned long nbits,
-		unsigned long start, unsigned long invert, unsigned long le);
-extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
-extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
-extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
-
-#ifndef find_next_bit
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The bitmap size in bits
- *
- * Returns the bit number for the next set bit
- * If no bits are set, returns @size.
- */
-static inline
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-			    unsigned long offset)
-{
-	if (small_const_nbits(size)) {
-		unsigned long val;
-
-		if (unlikely(offset >= size))
-			return size;
-
-		val = *addr & GENMASK(size - 1, offset);
-		return val ? __ffs(val) : size;
-	}
-
-	return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
-}
-#endif
-
-#ifndef find_next_and_bit
-/**
- * find_next_and_bit - find the next set bit in both memory regions
- * @addr1: The first address to base the search on
- * @addr2: The second address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The bitmap size in bits
- *
- * Returns the bit number for the next set bit
- * If no bits are set, returns @size.
- */
-static inline
-unsigned long find_next_and_bit(const unsigned long *addr1,
-		const unsigned long *addr2, unsigned long size,
-		unsigned long offset)
-{
-	if (small_const_nbits(size)) {
-		unsigned long val;
-
-		if (unlikely(offset >= size))
-			return size;
-
-		val = *addr1 & *addr2 & GENMASK(size - 1, offset);
-		return val ? __ffs(val) : size;
-	}
-
-	return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
-}
-#endif
-
-#ifndef find_next_zero_bit
-/**
- * find_next_zero_bit - find the next cleared bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The bitmap size in bits
- *
- * Returns the bit number of the next zero bit
- * If no bits are zero, returns @size.
- */
-static inline
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-				 unsigned long offset)
-{
-	if (small_const_nbits(size)) {
-		unsigned long val;
-
-		if (unlikely(offset >= size))
-			return size;
-
-		val = *addr | ~GENMASK(size - 1, offset);
-		return val == ~0UL ? size : ffz(val);
-	}
-
-	return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
-}
-#endif
-
-#ifndef find_first_bit
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum number of bits to search
- *
- * Returns the bit number of the first set bit.
- * If no bits are set, returns @size.
- */
-static inline
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
-{
-	if (small_const_nbits(size)) {
-		unsigned long val = *addr & GENMASK(size - 1, 0);
-
-		return val ? __ffs(val) : size;
-	}
-
-	return _find_first_bit(addr, size);
-}
-
-#endif /* find_first_bit */
-
-#ifndef find_first_zero_bit
-
-/**
- * find_first_zero_bit - find the first cleared bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum number of bits to search
- *
- * Returns the bit number of the first cleared bit.
- * If no bits are zero, returns @size.
- */
-static inline
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
-{
-	if (small_const_nbits(size)) {
-		unsigned long val = *addr | ~GENMASK(size - 1, 0);
-
-		return val == ~0UL ? size : ffz(val);
-	}
-
-	return _find_first_zero_bit(addr, size);
-}
-#endif
-
-#endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 95611df1d26e..ea97804d04d4 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -1,9 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PERF_BITOPS_H
-#define _PERF_BITOPS_H
+#ifndef _TOOLS_LINUX_BITMAP_H
+#define _TOOLS_LINUX_BITMAP_H
 
 #include <string.h>
 #include <linux/bitops.h>
+#include <linux/find.h>
 #include <stdlib.h>
 #include <linux/kernel.h>
 
@@ -181,4 +182,4 @@ static inline int bitmap_intersects(const unsigned long *src1,
 		return __bitmap_intersects(src1, src2, nbits);
 }
 
-#endif /* _PERF_BITOPS_H */
+#endif /* _TOOLS_LINUX_BITMAP_H */
diff --git a/tools/include/linux/find.h b/tools/include/linux/find.h
new file mode 100644
index 000000000000..47e2bd6c5174
--- /dev/null
+++ b/tools/include/linux/find.h
@@ -0,0 +1,214 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_FIND_H_
+#define _TOOLS_LINUX_FIND_H_
+
+#ifndef _TOOLS_LINUX_BITMAP_H
+#error tools: only <linux/bitmap.h> can be included directly
+#endif
+
+#include <linux/bitops.h>
+
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_and_bit(const unsigned long *addr1,
+					 const unsigned long *addr2, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
+#ifndef find_next_bit
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
+#endif
+
+#ifndef find_next_and_bit
+/**
+ * find_next_and_bit - find the next set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
+#endif
+
+#ifndef find_next_zero_bit
+/**
+ * find_next_zero_bit - find the next cleared bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number of the next zero bit
+ * If no bits are zero, returns @size.
+ */
+static inline
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+				 unsigned long offset)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val;
+
+		if (unlikely(offset >= size))
+			return size;
+
+		val = *addr | ~GENMASK(size - 1, offset);
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
+#endif
+
+#ifndef find_first_bit
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first set bit.
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_bit(addr, size);
+}
+#endif
+
+#ifndef find_first_and_bit
+/**
+ * find_first_and_bit - find the first set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_and_bit(const unsigned long *addr1,
+				 const unsigned long *addr2,
+				 unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_and_bit(addr1, addr2, size);
+}
+#endif
+
+#ifndef find_first_zero_bit
+/**
+ * find_first_zero_bit - find the first cleared bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first cleared bit.
+ * If no bits are zero, returns @size.
+ */
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_first_zero_bit(addr, size);
+}
+#endif
+
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __fls(val) : size;
+	}
+
+	return _find_last_bit(addr, size);
+}
+#endif
+
+/**
+ * find_next_clump8 - find next 8-bit clump with set bits in a memory region
+ * @clump: location to store copy of found clump
+ * @addr: address to base the search on
+ * @size: bitmap size in number of bits
+ * @offset: bit offset at which to start searching
+ *
+ * Returns the bit offset for the next set clump; the found clump value is
+ * copied to the location pointed by @clump. If no bits are set, returns @size.
+ */
+extern unsigned long find_next_clump8(unsigned long *clump,
+				      const unsigned long *addr,
+				      unsigned long size, unsigned long offset);
+
+#define find_first_clump8(clump, bits, size) \
+	find_next_clump8((clump), (bits), (size), 0)
+
+
+#endif /*__LINUX_FIND_H_ */
diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c
index 109aa7ffcf97..ba4b8d94e004 100644
--- a/tools/lib/find_bit.c
+++ b/tools/lib/find_bit.c
@@ -96,6 +96,26 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 }
 #endif
 
+#ifndef find_first_and_bit
+/*
+ * Find the first set bit in two memory regions.
+ */
+unsigned long _find_first_and_bit(const unsigned long *addr1,
+				  const unsigned long *addr2,
+				  unsigned long size)
+{
+	unsigned long idx, val;
+
+	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+		val = addr1[idx] & addr2[idx];
+		if (val)
+			return min(idx * BITS_PER_LONG + __ffs(val), size);
+	}
+
+	return size;
+}
+#endif
+
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.
-- 
cgit v1.2.3


From d3e2bb4359f70c8b1d09a6f8e2f57240aab0da3f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 14 Jan 2022 22:28:52 -0800
Subject: perf metric: Fix metric_leader

Multiple events may have a metric_leader to aggregate into.

This happens for uncore events where, for example, uncore_imc is
expanded into uncore_imc_0, uncore_imc_1, etc.

Such events all have the same metric_id and should aggregate into the
first event.

The change introducing metric_ids had a bug where the metric_id was
compared to itself, creating an always true condition.

Correct this by comparing the event in the metric_evlist and the
metric_leader.

Fixes: ec5c5b3d2c21b3f3 ("perf metric: Encode and use metric-id as qualifier")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20220115062852.1959424-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/metricgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 8826c555f780..d8492e339521 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -314,7 +314,7 @@ static int setup_metric_events(struct hashmap *ids,
 		 */
 		metric_id = evsel__metric_id(ev);
 		evlist__for_each_entry_continue(metric_evlist, ev) {
-			if (!strcmp(evsel__metric_id(metric_events[i]), metric_id))
+			if (!strcmp(evsel__metric_id(ev), metric_id))
 				ev->metric_leader = metric_events[i];
 		}
 	}
-- 
cgit v1.2.3


From 37be585807cb9a810f8395c39c4ee7bdbdc7b0dc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2022 17:11:10 -0300
Subject: perf cpumap: Add is_dummy() method

Needed to check if a cpu_map is dummy, i.e. not a cpu map at all, for
pid monitoring scenarios.

This probably needs to move to libperf, but since perf itself is the
first and so far only user, leave it at tools/perf/util/.

Acked-by: Andi Kleen <andi@firstfloor.org>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index afc15027d678..0d3c2006a15d 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -2,6 +2,7 @@
 #ifndef __PERF_CPUMAP_H
 #define __PERF_CPUMAP_H
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <internal/cpumap.h>
@@ -50,6 +51,15 @@ int cpu__setup_cpunode_map(void);
 int cpu__max_node(void);
 struct perf_cpu cpu__max_cpu(void);
 struct perf_cpu cpu__max_present_cpu(void);
+
+/**
+ * cpu_map__is_dummy - Events associated with a pid, rather than a CPU, use a single dummy map with an entry of -1.
+ */
+static inline bool cpu_map__is_dummy(struct perf_cpu_map *cpus)
+{
+	return cpus->nr == 1 && cpus->map[0].cpu == -1;
+}
+
 /**
  * cpu__get_node - Returns the numa node X as read from
  * /sys/devices/system/node/nodeX for the given CPU.
-- 
cgit v1.2.3


From 2eea0b56b0d6ace0172550477220a25d633ec5b9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2022 17:15:09 -0300
Subject: perf evlist: No need to do any affinity setup when profiling pids

The cpumap is dummy, so no need to go on figuring out affinity.o

This way we reduce the setup time for simple scenarios like:

	$ perf stat sleep 1

Acked-by: Andi Kleen <andi@firstfloor.org>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 11eb95b2106b..6e88d404b5b3 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1290,7 +1290,7 @@ void evlist__close(struct evlist *evlist)
 	 * With perf record core.cpus is usually NULL.
 	 * Use the old method to handle this for now.
 	 */
-	if (!evlist->core.cpus) {
+	if (!evlist->core.cpus || cpu_map__is_dummy(evlist->core.cpus)) {
 		evlist__for_each_entry_reverse(evlist, evsel)
 			evsel__close(evsel);
 		return;
-- 
cgit v1.2.3


From 9bce13ea88f85344b765abe5d3dabdd0f44dc177 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 9 Dec 2021 21:04:25 +0100
Subject: perf record: Disable debuginfod by default

Fedora 35 sets DEBUGINFOD_URLS by default, which might lead to
unexpected stalls in perf record exit path, when we try to cache
profiled binaries.

  # DEBUGINFOD_PROGRESS=1 ./perf record -a
  ^C[ perf record: Woken up 1 times to write data ]
  Downloading from https://debuginfod.fedoraproject.org/ 447069
  Downloading from https://debuginfod.fedoraproject.org/ 1502175
  Downloading \^Z

Disabling DEBUGINFOD_URLS by default in perf record and adding
debuginfod option and .perfconfig variable support to enable id.

  Default without debuginfo processing:
  # perf record -a

  Using system debuginfod setup:
  # perf record -a --debuginfod

  Using custom debuginfd url:
  # perf record -a --debuginfod='https://evenbetterdebuginfodserver.krava'

Adding single perf_debuginfod_setup function and using
it also in perf buildid-cache command.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20211209200425.303561-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-buildid-cache.txt |  5 ++++-
 tools/perf/Documentation/perf-config.txt        |  9 +++++++++
 tools/perf/Documentation/perf-record.txt        |  9 +++++++++
 tools/perf/builtin-buildid-cache.c              | 25 ++++++++++++++-----------
 tools/perf/builtin-record.c                     | 13 +++++++++++++
 tools/perf/util/util.c                          | 15 +++++++++++++++
 tools/perf/util/util.h                          |  6 ++++++
 7 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index cd8ce6e8ec12..7e44b419d301 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -74,12 +74,15 @@ OPTIONS
 	used when creating a uprobe for a process that resides in a
 	different mount namespace from the perf(1) utility.
 
---debuginfod=URLs::
+--debuginfod[=URLs]::
 	Specify debuginfod URL to be used when retrieving perf.data binaries,
 	it follows the same syntax as the DEBUGINFOD_URLS variable, like:
 
 	  buildid-cache.debuginfod=http://192.168.122.174:8002
 
+	If the URLs is not specified, the value of DEBUGINFOD_URLS
+	system environment variable is used.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 3bb75c1f25e8..0420e71698ee 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -587,6 +587,15 @@ record.*::
 		Use 'n' control blocks in asynchronous (Posix AIO) trace writing
 		mode ('n' default: 1, max: 4).
 
+	record.debuginfod::
+		Specify debuginfod URL to be used when cacheing perf.data binaries,
+		it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+		  http://192.168.122.174:8002
+
+		If the URLs is 'system', the value of DEBUGINFOD_URLS system environment
+		variable is used.
+
 diff.*::
 	diff.order::
 		This option sets the number of columns to sort the result.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 55df7b073a55..9ccc75935bc5 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -715,6 +715,15 @@ measurements:
 
 include::intel-hybrid.txt[]
 
+--debuginfod[=URLs]::
+	Specify debuginfod URL to be used when cacheing perf.data binaries,
+	it follows the same syntax as the DEBUGINFOD_URLS variable, like:
+
+	  http://192.168.122.174:8002
+
+	If the URLs is not specified, the value of DEBUGINFOD_URLS
+	system environment variable is used.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 0db3cfc04c47..cd381693658b 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -351,10 +351,14 @@ static int build_id_cache__show_all(void)
 
 static int perf_buildid_cache_config(const char *var, const char *value, void *cb)
 {
-	const char **debuginfod = cb;
+	struct perf_debuginfod *di = cb;
 
-	if (!strcmp(var, "buildid-cache.debuginfod"))
-		*debuginfod = strdup(value);
+	if (!strcmp(var, "buildid-cache.debuginfod")) {
+		di->urls = strdup(value);
+		if (!di->urls)
+			return -ENOMEM;
+		di->set = true;
+	}
 
 	return 0;
 }
@@ -373,8 +377,8 @@ int cmd_buildid_cache(int argc, const char **argv)
 		   *purge_name_list_str = NULL,
 		   *missing_filename = NULL,
 		   *update_name_list_str = NULL,
-		   *kcore_filename = NULL,
-		   *debuginfod = NULL;
+		   *kcore_filename = NULL;
+	struct perf_debuginfod debuginfod = { };
 	char sbuf[STRERR_BUFSIZE];
 
 	struct perf_data data = {
@@ -399,8 +403,10 @@ int cmd_buildid_cache(int argc, const char **argv)
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
 	OPT_STRING('u', "update", &update_name_list_str, "file list",
 		    "file(s) to update"),
-	OPT_STRING(0, "debuginfod", &debuginfod, "debuginfod url",
-		    "set debuginfod url"),
+	OPT_STRING_OPTARG_SET(0, "debuginfod", &debuginfod.urls,
+			&debuginfod.set, "debuginfod urls",
+			"Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
+			"system"),
 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 	OPT_INTEGER(0, "target-ns", &ns_id, "target pid for namespace context"),
 	OPT_END()
@@ -425,10 +431,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 	if (argc || !(list_files || opts_flag))
 		usage_with_options(buildid_cache_usage, buildid_cache_options);
 
-	if (debuginfod) {
-		pr_debug("DEBUGINFOD_URLS=%s\n", debuginfod);
-		setenv("DEBUGINFOD_URLS", debuginfod, 1);
-	}
+	perf_debuginfod_setup(&debuginfod);
 
 	/* -l is exclusive. It can not be used with other options. */
 	if (list_files && opts_flag) {
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0a63295d30f0..bb716c953d02 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -111,6 +111,7 @@ struct record {
 	unsigned long long	samples;
 	struct mmap_cpu_mask	affinity_mask;
 	unsigned long		output_max_size;	/* = 0: unlimited */
+	struct perf_debuginfod	debuginfod;
 };
 
 static volatile int done;
@@ -2177,6 +2178,12 @@ static int perf_record_config(const char *var, const char *value, void *cb)
 			rec->opts.nr_cblocks = nr_cblocks_default;
 	}
 #endif
+	if (!strcmp(var, "record.debuginfod")) {
+		rec->debuginfod.urls = strdup(value);
+		if (!rec->debuginfod.urls)
+			return -ENOMEM;
+		rec->debuginfod.set = true;
+	}
 
 	return 0;
 }
@@ -2667,6 +2674,10 @@ static struct option __record_options[] = {
 		      parse_control_option),
 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
+	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
+			  &record.debuginfod.set, "debuginfod urls",
+			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
+			  "system"),
 	OPT_END()
 };
 
@@ -2720,6 +2731,8 @@ int cmd_record(int argc, const char **argv)
 	if (err)
 		return err;
 
+	perf_debuginfod_setup(&record.debuginfod);
+
 	/* Make system wide (-a) the default target. */
 	if (!argc && target__none(&rec->opts.target))
 		rec->opts.target.system_wide = true;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index df3c4671be72..fb4f6616b5fa 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -416,3 +416,18 @@ char *perf_exe(char *buf, int len)
 	}
 	return strcpy(buf, "perf");
 }
+
+void perf_debuginfod_setup(struct perf_debuginfod *di)
+{
+	/*
+	 * By default '!di->set' we clear DEBUGINFOD_URLS, so debuginfod
+	 * processing is not triggered, otherwise we set it to 'di->urls'
+	 * value. If 'di->urls' is "system" we keep DEBUGINFOD_URLS value.
+	 */
+	if (!di->set)
+		setenv("DEBUGINFOD_URLS", "", 1);
+	else if (di->urls && strcmp(di->urls, "system"))
+		setenv("DEBUGINFOD_URLS", di->urls, 1);
+
+	pr_debug("DEBUGINFOD_URLS=%s\n", getenv("DEBUGINFOD_URLS"));
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 9443c29afa52..7b625cbd2dd8 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -71,4 +71,10 @@ void test_attr__init(void);
 struct perf_event_attr;
 void test_attr__open(struct perf_event_attr *attr, pid_t pid, struct perf_cpu cpu,
 		     int fd, int group_fd, unsigned long flags);
+
+struct perf_debuginfod {
+	const char	*urls;
+	bool		 set;
+};
+void perf_debuginfod_setup(struct perf_debuginfod *di);
 #endif /* GIT_COMPAT_UTIL_H */
-- 
cgit v1.2.3


From 9e6d484f9991176269607bb3c54a494e32eab27a Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 17 Jan 2022 16:05:41 +0100
Subject: KVM: selftests: Rename 'get_cpuid_test' to 'cpuid_test'

In preparation to reusing the existing 'get_cpuid_test' for testing
"KVM_SET_CPUID{,2} after KVM_RUN" rename it to 'cpuid_test' to avoid
the confusion.

No functional change intended.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20220117150542.2176196-4-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore             |   2 +-
 tools/testing/selftests/kvm/Makefile               |   4 +-
 tools/testing/selftests/kvm/x86_64/cpuid_test.c    | 179 +++++++++++++++++++++
 .../testing/selftests/kvm/x86_64/get_cpuid_test.c  | 179 ---------------------
 4 files changed, 182 insertions(+), 182 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86_64/cpuid_test.c
 delete mode 100644 tools/testing/selftests/kvm/x86_64/get_cpuid_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 8c129961accf..20b4c921c9a2 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -8,11 +8,11 @@
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
+/x86_64/cpuid_test
 /x86_64/cr4_cpuid_sync_test
 /x86_64/debug_regs
 /x86_64/evmcs_test
 /x86_64/emulator_error_test
-/x86_64/get_cpuid_test
 /x86_64/get_msr_index_features
 /x86_64/kvm_clock_test
 /x86_64/kvm_pv_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ee8cf2149824..ec78a8692899 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
 LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
 
-TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
+TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
-TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
new file mode 100644
index 000000000000..a711f83749ea
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat Inc.
+ *
+ * Generic tests for KVM CPUID set/get ioctls
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+/* CPUIDs known to differ */
+struct {
+	u32 function;
+	u32 index;
+} mangled_cpuids[] = {
+	/*
+	 * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
+	 * which are not controlled for by this test.
+	 */
+	{.function = 0xd, .index = 0},
+	{.function = 0xd, .index = 1},
+};
+
+static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
+{
+	int i;
+	u32 eax, ebx, ecx, edx;
+
+	for (i = 0; i < guest_cpuid->nent; i++) {
+		eax = guest_cpuid->entries[i].function;
+		ecx = guest_cpuid->entries[i].index;
+
+		cpuid(&eax, &ebx, &ecx, &edx);
+
+		GUEST_ASSERT(eax == guest_cpuid->entries[i].eax &&
+			     ebx == guest_cpuid->entries[i].ebx &&
+			     ecx == guest_cpuid->entries[i].ecx &&
+			     edx == guest_cpuid->entries[i].edx);
+	}
+
+}
+
+static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid)
+{
+	u32 eax = 0x40000000, ebx, ecx = 0, edx;
+
+	cpuid(&eax, &ebx, &ecx, &edx);
+
+	GUEST_ASSERT(eax == 0x40000001);
+}
+
+static void guest_main(struct kvm_cpuid2 *guest_cpuid)
+{
+	GUEST_SYNC(1);
+
+	test_guest_cpuids(guest_cpuid);
+
+	GUEST_SYNC(2);
+
+	test_cpuid_40000000(guest_cpuid);
+
+	GUEST_DONE();
+}
+
+static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie)
+{
+	int i;
+
+	for (i = 0; i < sizeof(mangled_cpuids); i++) {
+		if (mangled_cpuids[i].function == entrie->function &&
+		    mangled_cpuids[i].index == entrie->index)
+			return true;
+	}
+
+	return false;
+}
+
+static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie)
+{
+	int i;
+
+	for (i = 0; i < cpuid->nent; i++) {
+		if (cpuid->entries[i].function == entrie->function &&
+		    cpuid->entries[i].index == entrie->index) {
+			if (is_cpuid_mangled(entrie))
+				return;
+
+			TEST_ASSERT(cpuid->entries[i].eax == entrie->eax &&
+				    cpuid->entries[i].ebx == entrie->ebx &&
+				    cpuid->entries[i].ecx == entrie->ecx &&
+				    cpuid->entries[i].edx == entrie->edx,
+				    "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
+				    entrie->function, entrie->index,
+				    cpuid->entries[i].eax, cpuid->entries[i].ebx,
+				    cpuid->entries[i].ecx, cpuid->entries[i].edx,
+				    entrie->eax, entrie->ebx, entrie->ecx, entrie->edx);
+			return;
+		}
+	}
+
+	TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index);
+}
+
+static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2)
+{
+	int i;
+
+	for (i = 0; i < cpuid1->nent; i++)
+		check_cpuid(cpuid2, &cpuid1->entries[i]);
+
+	for (i = 0; i < cpuid2->nent; i++)
+		check_cpuid(cpuid1, &cpuid2->entries[i]);
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+	struct ucall uc;
+
+	_vcpu_run(vm, vcpuid);
+
+	switch (get_ucall(vm, vcpuid, &uc)) {
+	case UCALL_SYNC:
+		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+			    uc.args[1] == stage + 1,
+			    "Stage %d: Unexpected register values vmexit, got %lx",
+			    stage + 1, (ulong)uc.args[1]);
+		return;
+	case UCALL_DONE:
+		return;
+	case UCALL_ABORT:
+		TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+			    __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+	default:
+		TEST_ASSERT(false, "Unexpected exit: %s",
+			    exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+	}
+}
+
+struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid)
+{
+	int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]);
+	vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR);
+	struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva);
+
+	memcpy(guest_cpuids, cpuid, size);
+
+	*p_gva = gva;
+	return guest_cpuids;
+}
+
+int main(void)
+{
+	struct kvm_cpuid2 *supp_cpuid, *cpuid2;
+	vm_vaddr_t cpuid_gva;
+	struct kvm_vm *vm;
+	int stage;
+
+	vm = vm_create_default(VCPU_ID, 0, guest_main);
+
+	supp_cpuid = kvm_get_supported_cpuid();
+	cpuid2 = vcpu_get_cpuid(vm, VCPU_ID);
+
+	compare_cpuids(supp_cpuid, cpuid2);
+
+	vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2);
+
+	vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva);
+
+	for (stage = 0; stage < 3; stage++)
+		run_vcpu(vm, VCPU_ID, stage);
+
+	kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
deleted file mode 100644
index a711f83749ea..000000000000
--- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2021, Red Hat Inc.
- *
- * Generic tests for KVM CPUID set/get ioctls
- */
-#include <asm/kvm_para.h>
-#include <linux/kvm_para.h>
-#include <stdint.h>
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "processor.h"
-
-#define VCPU_ID 0
-
-/* CPUIDs known to differ */
-struct {
-	u32 function;
-	u32 index;
-} mangled_cpuids[] = {
-	/*
-	 * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
-	 * which are not controlled for by this test.
-	 */
-	{.function = 0xd, .index = 0},
-	{.function = 0xd, .index = 1},
-};
-
-static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
-{
-	int i;
-	u32 eax, ebx, ecx, edx;
-
-	for (i = 0; i < guest_cpuid->nent; i++) {
-		eax = guest_cpuid->entries[i].function;
-		ecx = guest_cpuid->entries[i].index;
-
-		cpuid(&eax, &ebx, &ecx, &edx);
-
-		GUEST_ASSERT(eax == guest_cpuid->entries[i].eax &&
-			     ebx == guest_cpuid->entries[i].ebx &&
-			     ecx == guest_cpuid->entries[i].ecx &&
-			     edx == guest_cpuid->entries[i].edx);
-	}
-
-}
-
-static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid)
-{
-	u32 eax = 0x40000000, ebx, ecx = 0, edx;
-
-	cpuid(&eax, &ebx, &ecx, &edx);
-
-	GUEST_ASSERT(eax == 0x40000001);
-}
-
-static void guest_main(struct kvm_cpuid2 *guest_cpuid)
-{
-	GUEST_SYNC(1);
-
-	test_guest_cpuids(guest_cpuid);
-
-	GUEST_SYNC(2);
-
-	test_cpuid_40000000(guest_cpuid);
-
-	GUEST_DONE();
-}
-
-static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie)
-{
-	int i;
-
-	for (i = 0; i < sizeof(mangled_cpuids); i++) {
-		if (mangled_cpuids[i].function == entrie->function &&
-		    mangled_cpuids[i].index == entrie->index)
-			return true;
-	}
-
-	return false;
-}
-
-static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie)
-{
-	int i;
-
-	for (i = 0; i < cpuid->nent; i++) {
-		if (cpuid->entries[i].function == entrie->function &&
-		    cpuid->entries[i].index == entrie->index) {
-			if (is_cpuid_mangled(entrie))
-				return;
-
-			TEST_ASSERT(cpuid->entries[i].eax == entrie->eax &&
-				    cpuid->entries[i].ebx == entrie->ebx &&
-				    cpuid->entries[i].ecx == entrie->ecx &&
-				    cpuid->entries[i].edx == entrie->edx,
-				    "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
-				    entrie->function, entrie->index,
-				    cpuid->entries[i].eax, cpuid->entries[i].ebx,
-				    cpuid->entries[i].ecx, cpuid->entries[i].edx,
-				    entrie->eax, entrie->ebx, entrie->ecx, entrie->edx);
-			return;
-		}
-	}
-
-	TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index);
-}
-
-static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2)
-{
-	int i;
-
-	for (i = 0; i < cpuid1->nent; i++)
-		check_cpuid(cpuid2, &cpuid1->entries[i]);
-
-	for (i = 0; i < cpuid2->nent; i++)
-		check_cpuid(cpuid1, &cpuid2->entries[i]);
-}
-
-static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
-{
-	struct ucall uc;
-
-	_vcpu_run(vm, vcpuid);
-
-	switch (get_ucall(vm, vcpuid, &uc)) {
-	case UCALL_SYNC:
-		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-			    uc.args[1] == stage + 1,
-			    "Stage %d: Unexpected register values vmexit, got %lx",
-			    stage + 1, (ulong)uc.args[1]);
-		return;
-	case UCALL_DONE:
-		return;
-	case UCALL_ABORT:
-		TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0],
-			    __FILE__, uc.args[1], uc.args[2], uc.args[3]);
-	default:
-		TEST_ASSERT(false, "Unexpected exit: %s",
-			    exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
-	}
-}
-
-struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid)
-{
-	int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]);
-	vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR);
-	struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva);
-
-	memcpy(guest_cpuids, cpuid, size);
-
-	*p_gva = gva;
-	return guest_cpuids;
-}
-
-int main(void)
-{
-	struct kvm_cpuid2 *supp_cpuid, *cpuid2;
-	vm_vaddr_t cpuid_gva;
-	struct kvm_vm *vm;
-	int stage;
-
-	vm = vm_create_default(VCPU_ID, 0, guest_main);
-
-	supp_cpuid = kvm_get_supported_cpuid();
-	cpuid2 = vcpu_get_cpuid(vm, VCPU_ID);
-
-	compare_cpuids(supp_cpuid, cpuid2);
-
-	vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2);
-
-	vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva);
-
-	for (stage = 0; stage < 3; stage++)
-		run_vcpu(vm, VCPU_ID, stage);
-
-	kvm_vm_free(vm);
-}
-- 
cgit v1.2.3


From ecebb966acaab2466d9857d1cc435ee1fc9eee50 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 17 Jan 2022 16:05:42 +0100
Subject: KVM: selftests: Test KVM_SET_CPUID2 after KVM_RUN

KVM forbids KVM_SET_CPUID2 after KVM_RUN was performed on a vCPU unless
the supplied CPUID data is equal to what was previously set. Test this.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20220117150542.2176196-5-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/include/x86_64/processor.h       |  7 +++++
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 33 +++++++++++++++++++---
 tools/testing/selftests/kvm/x86_64/cpuid_test.c    | 30 ++++++++++++++++++++
 3 files changed, 66 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index e94ba0fc67d8..bb013d101c14 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -375,6 +375,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
 struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
 
 struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_cpuid2 *cpuid);
 void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
 		    struct kvm_cpuid2 *cpuid);
 
@@ -418,6 +420,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr);
 void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
 			     uint64_t pte);
 
+/*
+ * get_cpuid() - find matching CPUID entry and return pointer to it.
+ */
+struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
+				   uint32_t index);
 /*
  * set_cpuid() - overwrites a matching cpuid entry with the provided value.
  *		 matches based on ent->function && ent->index. returns true
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index babb0f28575c..d61e2326dc85 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 	return entry;
 }
 
+
+int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_cpuid2 *cpuid)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+}
+
 /*
  * VM VCPU CPUID Set
  *
@@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
 void vcpu_set_cpuid(struct kvm_vm *vm,
 		uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
 {
-	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
 	int rc;
 
-	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
-
-	rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+	rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
 	TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
 		    rc, errno);
 
@@ -1384,6 +1392,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
 	}
 }
 
+struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
+				   uint32_t index)
+{
+	int i;
+
+	for (i = 0; i < cpuid->nent; i++) {
+		struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
+
+		if (cur->function == function && cur->index == index)
+			return cur;
+	}
+
+	TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
+
+	return NULL;
+}
+
 bool set_cpuid(struct kvm_cpuid2 *cpuid,
 	       struct kvm_cpuid_entry2 *ent)
 {
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index a711f83749ea..16d2465c5634 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
 	return guest_cpuids;
 }
 
+static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
+{
+	struct kvm_cpuid_entry2 *ent;
+	int rc;
+	u32 eax, ebx, x;
+
+	/* Setting unmodified CPUID is allowed */
+	rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+	TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
+
+	/* Changing CPU features is forbidden */
+	ent = get_cpuid(cpuid, 0x7, 0);
+	ebx = ent->ebx;
+	ent->ebx--;
+	rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+	TEST_ASSERT(rc, "Changing CPU features should fail");
+	ent->ebx = ebx;
+
+	/* Changing MAXPHYADDR is forbidden */
+	ent = get_cpuid(cpuid, 0x80000008, 0);
+	eax = ent->eax;
+	x = eax & 0xff;
+	ent->eax = (eax & ~0xffu) | (x - 1);
+	rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+	TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
+	ent->eax = eax;
+}
+
 int main(void)
 {
 	struct kvm_cpuid2 *supp_cpuid, *cpuid2;
@@ -175,5 +203,7 @@ int main(void)
 	for (stage = 0; stage < 3; stage++)
 		run_vcpu(vm, VCPU_ID, stage);
 
+	set_cpuid_after_run(vm, cpuid2);
+
 	kvm_vm_free(vm);
 }
-- 
cgit v1.2.3


From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001
From: Zechuan Chen <chenzechuan1@huawei.com>
Date: Tue, 28 Dec 2021 19:13:38 +0800
Subject: perf probe: Fix ppc64 'perf probe add events failed' case

Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms
lookup across powerpc ABIv1 and ABIv2"), in ppc64 ABIv1, our perf
command eliminates the need to use the prefix "." at the symbol name.

But when the command "perf probe -a schedule" is executed on ppc64
ABIv1, it obtains two symbol address information through /proc/kallsyms,
for example:

  cat /proc/kallsyms | grep -w schedule
  c000000000657020 T .schedule
  c000000000d4fdb8 D schedule

The symbol "D schedule" is not a function symbol, and perf will print:
"p:probe/schedule _text+13958584"Failed to write event: Invalid argument

Therefore, when searching symbols from map and adding probe point for
them, a symbol type check is added. If the type of symbol is not a
function, skip it.

Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2")
Signed-off-by: Zechuan Chen <chenzechuan1@huawei.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jianlin Lv <Jianlin.Lv@arm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Yang Jihong <yangjihong1@huawei.com>
Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b2a02c9ab8ea..a834918a0a0d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	for (j = 0; j < num_matched_functions; j++) {
 		sym = syms[j];
 
+		if (sym->type != STT_FUNC)
+			continue;
+
 		/* There can be duplicated symbols in the map */
 		for (i = 0; i < j; i++)
 			if (sym->start == syms[i]->start) {
-- 
cgit v1.2.3


From 1855b796f2f672cbb25400be2d3171c26fc869a3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2022 13:09:28 -0300
Subject: perf affinity: Allow passing a NULL arg to affinity__cleanup()

Just like with free(), NULL is checked to avoid having all callers do
it.

Its convenient for when not using affinity setup/cleanup for dummy CPU
maps, i.e. CPU maps for pid targets.

Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220117160931.1191712-2-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/affinity.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c
index f1e30d566db3..4d216c0dc425 100644
--- a/tools/perf/util/affinity.c
+++ b/tools/perf/util/affinity.c
@@ -62,7 +62,7 @@ void affinity__set(struct affinity *a, int cpu)
 	clear_bit(cpu, a->sched_cpus);
 }
 
-void affinity__cleanup(struct affinity *a)
+static void __affinity__cleanup(struct affinity *a)
 {
 	int cpu_set_size = get_cpu_set_size();
 
@@ -71,3 +71,9 @@ void affinity__cleanup(struct affinity *a)
 	zfree(&a->sched_cpus);
 	zfree(&a->orig_cpus);
 }
+
+void affinity__cleanup(struct affinity *a)
+{
+	if (a != NULL)
+		__affinity__cleanup(a);
+}
-- 
cgit v1.2.3


From 49de179577e7b05b57f625bf05cdc60a72de38d0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2022 13:09:29 -0300
Subject: perf stat: No need to setup affinities when starting a workload

I.e. the simple:

  $ perf stat sleep 1

Uses a dummy CPU map and thus there is no need to setup/cleanup
affinities to avoid IPIs, etc.

With this we're down to a sched_getaffinity() call, in the libnuma
initialization, that probably can be removed in a followup patch.

Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220117160931.1191712-3-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 973ade18b72a..934e992c966f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -788,7 +788,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	const bool forks = (argc > 0);
 	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity affinity;
+	struct affinity saved_affinity, *affinity = NULL;
 	int err;
 	bool second_pass = false;
 
@@ -803,8 +803,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	if (group)
 		evlist__set_leader(evsel_list);
 
-	if (affinity__setup(&affinity) < 0)
-		return -1;
+	if (!cpu_map__is_dummy(evsel_list->core.cpus)) {
+		if (affinity__setup(&saved_affinity) < 0)
+			return -1;
+		affinity = &saved_affinity;
+	}
 
 	evlist__for_each_entry(evsel_list, counter) {
 		if (bpf_counter__load(counter, &target))
@@ -813,7 +816,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			all_counters_use_bpf = false;
 	}
 
-	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
 		counter = evlist_cpu_itr.evsel;
 
 		/*
@@ -869,7 +872,7 @@ try_again:
 		 */
 
 		/* First close errored or weak retry */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group && !counter->errored)
@@ -878,7 +881,7 @@ try_again:
 			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
 		}
 		/* Now reopen weak */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group && !counter->errored)
@@ -904,7 +907,7 @@ try_again_reset:
 			counter->supported = true;
 		}
 	}
-	affinity__cleanup(&affinity);
+	affinity__cleanup(affinity);
 
 	evlist__for_each_entry(evsel_list, counter) {
 		if (!counter->supported) {
-- 
cgit v1.2.3


From f350ee95498a3fa65c37ed597d9c051c6b2b6974 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2022 13:09:30 -0300
Subject: perf evlist: No need to setup affinities when enabling events for pid
 targets

When the target is a pid, not started by 'perf stat' we need to enable
the events, and in that case there is no need to setup affinities as we
use a dummy CPU map, with just one entry set to -1.

So stop doing it to avoid this needless call to sched_getaffinity():

  # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1
  <SNIP>
  sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8
   > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea]
   > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x5329ca]
   > /var/home/acme/bin/perf(__evlist__enable.constprop.0+0x23) [0x4b9693]
   > /var/home/acme/bin/perf(enable_counters+0x14d) [0x42de5d]
   > /var/home/acme/bin/perf(cmd_stat+0x2358) [0x4310c8]
   > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa]
   > /var/home/acme/bin/perf(main+0x612) [0x40f8c2]
   > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74]
   > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd]
  <SNIP>

Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220117160931.1191712-4-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 6e88d404b5b3..ae6d4363da76 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -487,12 +487,16 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name)
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity affinity;
+	struct affinity saved_affinity, *affinity = NULL;
 
-	if (affinity__setup(&affinity) < 0)
-		return;
+	// See explanation in evlist__close()
+	if (!cpu_map__is_dummy(evlist->core.cpus)) {
+		if (affinity__setup(&saved_affinity) < 0)
+			return;
+		affinity = &saved_affinity;
+	}
 
-	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
 		pos = evlist_cpu_itr.evsel;
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -500,7 +504,7 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name)
 			continue;
 		evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
 	}
-	affinity__cleanup(&affinity);
+	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
-- 
cgit v1.2.3


From 0d3d237651fd7a01fe5dc501b0d170a43d8156ba Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2022 13:09:31 -0300
Subject: perf evlist: No need to setup affinities when disabling events for
 pid targets

When the target is a pid, not started by 'perf stat' we need to disable
the events, and in that case there is no need to setup affinities as we
use a dummy CPU map, with just one entry set to -1.

So stop doing it to avoid this needless call to sched_getaffinity():

  # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1
  <SNIP>
  sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8
   > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea]
   > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x532a2a]
   > /var/home/acme/bin/perf(__evlist__disable.constprop.0+0x27) [0x4b9827]
   > /var/home/acme/bin/perf(cmd_stat+0x29b5) [0x431725]
   > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa]
   > /var/home/acme/bin/perf(main+0x612) [0x40f8c2]
   > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74]
   > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd]
  <SNIP>

Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220117160931.1191712-5-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ae6d4363da76..eaad04e1672a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -430,15 +430,19 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name)
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity affinity;
+	struct affinity saved_affinity, *affinity = NULL;
 	bool has_imm = false;
 
-	if (affinity__setup(&affinity) < 0)
-		return;
+	// See explanation in evlist__close()
+	if (!cpu_map__is_dummy(evlist->core.cpus)) {
+		if (affinity__setup(&saved_affinity) < 0)
+			return;
+		affinity = &saved_affinity;
+	}
 
 	/* Disable 'immediate' events last */
 	for (int imm = 0; imm <= 1; imm++) {
-		evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
 			pos = evlist_cpu_itr.evsel;
 			if (evsel__strcmp(pos, evsel_name))
 				continue;
@@ -454,7 +458,7 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name)
 			break;
 	}
 
-	affinity__cleanup(&affinity);
+	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
-- 
cgit v1.2.3


From 722e4db3ae0d52b2e3801280afbe19cf2d188e91 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 10 Jan 2022 13:44:18 +0000
Subject: bpf, selftests: Add various ringbuf tests with invalid offset

Assert that the verifier is rejecting invalid offsets on the ringbuf entries:

  # ./test_verifier | grep ring
  #947/u ringbuf: invalid reservation offset 1 OK
  #947/p ringbuf: invalid reservation offset 1 OK
  #948/u ringbuf: invalid reservation offset 2 OK
  #948/p ringbuf: invalid reservation offset 2 OK

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/ringbuf.c | 64 ++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/verifier/ringbuf.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c
new file mode 100644
index 000000000000..68cae6947cc4
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ringbuf.c
@@ -0,0 +1,64 @@
+{
+	"ringbuf: invalid reservation offset 1",
+	.insns = {
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	/* store a pointer to the reserved memory in R6 */
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+	/* spill R6(mem) into the stack */
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+	/* fill it back in R7 */
+	BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8),
+	/* should be able to access *(R7) = 0 */
+	BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0),
+	/* submit the reserved ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+	/* add invalid offset to reserved ringbuf memory */
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 1 },
+	.result = REJECT,
+	.errstr = "dereference of modified mem ptr R1",
+},
+{
+	"ringbuf: invalid reservation offset 2",
+	.insns = {
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	/* store a pointer to the reserved memory in R6 */
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+	/* spill R6(mem) into the stack */
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+	/* fill it back in R7 */
+	BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8),
+	/* add invalid offset to reserved ringbuf memory */
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe),
+	/* should be able to access *(R7) = 0 */
+	BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0),
+	/* submit the reserved ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 1 },
+	.result = REJECT,
+	.errstr = "R7 min value is outside of the allowed memory range",
+},
-- 
cgit v1.2.3


From 37c8d4807d1b8b521b30310dce97f6695dc2c2c6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 12 Jan 2022 12:39:48 +0000
Subject: bpf, selftests: Add ringbuf memory type confusion test

Add two tests, one which asserts that ring buffer memory can be passed to
other helpers for populating its entry area, and another one where verifier
rejects different type of memory passed to bpf_ringbuf_submit().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/d_path.c    | 14 +++++++++
 .../selftests/bpf/progs/test_d_path_check_types.c  | 32 +++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/ringbuf.c     | 33 +++++++++++++++++++++-
 tools/testing/selftests/bpf/verifier/spill_fill.c  |  2 +-
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_types.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
index 32fc5b3b5cf6..911345c526e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/d_path.c
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -10,6 +10,7 @@
 
 #include "test_d_path.skel.h"
 #include "test_d_path_check_rdonly_mem.skel.h"
+#include "test_d_path_check_types.skel.h"
 
 static int duration;
 
@@ -167,6 +168,16 @@ static void test_d_path_check_rdonly_mem(void)
 	test_d_path_check_rdonly_mem__destroy(skel);
 }
 
+static void test_d_path_check_types(void)
+{
+	struct test_d_path_check_types *skel;
+
+	skel = test_d_path_check_types__open_and_load();
+	ASSERT_ERR_PTR(skel, "unexpected_load_passing_wrong_type");
+
+	test_d_path_check_types__destroy(skel);
+}
+
 void test_d_path(void)
 {
 	if (test__start_subtest("basic"))
@@ -174,4 +185,7 @@ void test_d_path(void)
 
 	if (test__start_subtest("check_rdonly_mem"))
 		test_d_path_check_rdonly_mem();
+
+	if (test__start_subtest("check_alloc_mem"))
+		test_d_path_check_types();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_types.c b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
new file mode 100644
index 000000000000..7e02b7361307
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern const int bpf_prog_active __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 1 << 12);
+} ringbuf SEC(".maps");
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat,
+	     __u32 request_mask, unsigned int query_flags)
+{
+	void *active;
+	u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* FAIL here! 'active' points to 'regular' memory. It
+		 * cannot be submitted to ring buffer.
+		 */
+		bpf_ringbuf_submit(active, 0);
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c
index 68cae6947cc4..b64d33e4833c 100644
--- a/tools/testing/selftests/bpf/verifier/ringbuf.c
+++ b/tools/testing/selftests/bpf/verifier/ringbuf.c
@@ -28,7 +28,7 @@
 	},
 	.fixup_map_ringbuf = { 1 },
 	.result = REJECT,
-	.errstr = "dereference of modified mem ptr R1",
+	.errstr = "dereference of modified alloc_mem ptr R1",
 },
 {
 	"ringbuf: invalid reservation offset 2",
@@ -62,3 +62,34 @@
 	.result = REJECT,
 	.errstr = "R7 min value is outside of the allowed memory range",
 },
+{
+	"ringbuf: check passing rb mem to helpers",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	/* pass allocated ring buffer memory to fib lookup */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_3, 8),
+	BPF_MOV64_IMM(BPF_REG_4, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_fib_lookup),
+	/* submit the ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 2 },
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c
index 1a8eb9672bd1..8cfc5349d2a8 100644
--- a/tools/testing/selftests/bpf/verifier/spill_fill.c
+++ b/tools/testing/selftests/bpf/verifier/spill_fill.c
@@ -84,7 +84,7 @@
 	},
 	.fixup_map_ringbuf = { 1 },
 	.result = REJECT,
-	.errstr = "R0 pointer arithmetic on mem_or_null prohibited",
+	.errstr = "R0 pointer arithmetic on alloc_mem_or_null prohibited",
 },
 {
 	"check corrupted spill/fill",
-- 
cgit v1.2.3


From 9d5f0c36438eeae7566ca383b2b673179e3cc613 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 18 Jan 2022 09:02:43 -0300
Subject: perf machine: Use path__join() to compose a path instead of
 snprintf(dir, '/', filename)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Its more intention revealing, and if we're interested in the odd cases
where this may end up truncating we can do debug checks at one
centralized place.

Motivation, of all the container builds, fedora rawhide started
complaining of:

  util/machine.c: In function ‘machine__create_modules’:
  util/machine.c:1419:50: error: ‘%s’ directive output may be truncated writing up to 255 bytes into a region of size between 0 and 4095 [-Werror=format-truncation=]
   1419 |                 snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
        |                                                  ^~
  In file included from /usr/include/stdio.h:894,
                   from util/branch.h:9,
                   from util/callchain.h:8,
                   from util/machine.c:7:
  In function ‘snprintf’,
      inlined from ‘maps__set_modules_path_dir’ at util/machine.c:1419:3,
      inlined from ‘machine__set_modules_path’ at util/machine.c:1473:9,
      inlined from ‘machine__create_modules’ at util/machine.c:1519:7:
  /usr/include/bits/stdio2.h:71:10: note: ‘__builtin___snprintf_chk’ output between 2 and 4352 bytes into a destination of size 4096

There are other places where we should use path__join(), but lets get rid of
this one first.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Link: Link: https://lore.kernel.org/r/YebZKjwgfdOz0lAs@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/machine.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 3901440aeff9..f70ba56912d4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -16,6 +16,7 @@
 #include "map_symbol.h"
 #include "branch.h"
 #include "mem-events.h"
+#include "path.h"
 #include "srcline.h"
 #include "symbol.h"
 #include "sort.h"
@@ -1416,7 +1417,7 @@ static int maps__set_modules_path_dir(struct maps *maps, const char *dir_name, i
 		struct stat st;
 
 		/*sshfs might return bad dent->d_type, so we have to stat*/
-		snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
+		path__join(path, sizeof(path), dir_name, dent->d_name);
 		if (stat(path, &st))
 			continue;
 
-- 
cgit v1.2.3


From 1a1d1dbce6d5477e2bb08ce1ef0d77caa838cc8e Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Mon, 17 Jan 2022 20:48:17 -0500
Subject: kvm: selftests: conditionally build vm_xsave_req_perm()

vm_xsave_req_perm() is currently defined and used by x86_64 only.
Make it compiled into vm_create_with_vcpus() only when on x86_64
machines. Otherwise, it would cause linkage errors, e.g. on s390x.

Fixes: 415a3c33e8 ("kvm: selftests: Add support for KVM_CAP_XSAVE2")
Reported-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Tested-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Message-Id: <20220118014817.30910-1-wei.w.wang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 4a645dc77f34..c22a17aac6b0 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
 	struct kvm_vm *vm;
 	int i;
 
+#ifdef __x86_64__
 	/*
 	 * Permission needs to be requested before KVM_SET_CPUID2.
 	 */
 	vm_xsave_req_perm();
+#endif
 
 	/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
 	if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
-- 
cgit v1.2.3


From b33b9c407861985713ca18cc9ea05b7540210ad4 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 14 Jan 2022 21:24:27 -0800
Subject: selftests: kvm/x86: Parameterize the CPUID vendor string check

Refactor is_intel_cpu() to make it easier to reuse the bulk of the
code for other vendors in the future.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220115052431.447232-3-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index d61e2326dc85..308a1fe687f1 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1253,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
 	free(state);
 }
 
-bool is_intel_cpu(void)
+static bool cpu_vendor_string_is(const char *vendor)
 {
+	const uint32_t *chunk = (const uint32_t *)vendor;
 	int eax, ebx, ecx, edx;
-	const uint32_t *chunk;
 	const int leaf = 0;
 
 	__asm__ __volatile__(
@@ -1265,10 +1265,14 @@ bool is_intel_cpu(void)
 		  "=c"(ecx), "=d"(edx)
 		: /* input */ "0"(leaf), "2"(0));
 
-	chunk = (const uint32_t *)("GenuineIntel");
 	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
 }
 
+bool is_intel_cpu(void)
+{
+	return cpu_vendor_string_is("GenuineIntel");
+}
+
 uint32_t kvm_get_cpuid_max_basic(void)
 {
 	return kvm_get_supported_cpuid_entry(0)->eax;
-- 
cgit v1.2.3


From 21066101f42cfd86fdd835b70ce0e36c335f5f4d Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 14 Jan 2022 21:24:28 -0800
Subject: selftests: kvm/x86: Introduce is_amd_cpu()

Replace the one ad hoc "AuthenticAMD" CPUID vendor string comparison
with a new function, is_amd_cpu().

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220115052431.447232-4-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h |  1 +
 tools/testing/selftests/kvm/lib/x86_64/processor.c     | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index bb013d101c14..e82c44aa029a 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -364,6 +364,7 @@ static inline unsigned long get_xmm(int n)
 }
 
 bool is_intel_cpu(void);
+bool is_amd_cpu(void);
 
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 308a1fe687f1..6bf01ded3776 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1273,6 +1273,14 @@ bool is_intel_cpu(void)
 	return cpu_vendor_string_is("GenuineIntel");
 }
 
+/*
+ * Exclude early K5 samples with a vendor string of "AMDisbetter!"
+ */
+bool is_amd_cpu(void)
+{
+	return cpu_vendor_string_is("AuthenticAMD");
+}
+
 uint32_t kvm_get_cpuid_max_basic(void)
 {
 	return kvm_get_supported_cpuid_entry(0)->eax;
@@ -1508,10 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
 	return cpuid;
 }
 
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
-#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
-
 static inline unsigned x86_family(unsigned int eax)
 {
         unsigned int x86;
@@ -1533,11 +1537,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 	max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
 
 	/* Avoid reserved HyperTransport region on AMD processors.  */
-	eax = ecx = 0;
-	cpuid(&eax, &ebx, &ecx, &edx);
-	if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
-	    ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
-	    edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+	if (!is_amd_cpu())
 		return max_gfn;
 
 	/* On parts with <40 physical address bits, the area is fully hidden */
-- 
cgit v1.2.3


From 398f9240f90f4168f5882180723f743f7b682049 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 14 Jan 2022 21:24:29 -0800
Subject: selftests: kvm/x86: Export x86_family() for use outside of
 processor.c

Move this static inline function to processor.h, so that it can be
used in individual tests, as needed.

Opportunistically replace the bare 'unsigned' with 'unsigned int.'

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220115052431.447232-5-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h | 12 ++++++++++++
 tools/testing/selftests/kvm/lib/x86_64/processor.c     | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index e82c44aa029a..4eb73959ce05 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -366,6 +366,18 @@ static inline unsigned long get_xmm(int n)
 bool is_intel_cpu(void);
 bool is_amd_cpu(void);
 
+static inline unsigned int x86_family(unsigned int eax)
+{
+        unsigned int x86;
+
+        x86 = (eax >> 8) & 0xf;
+
+        if (x86 == 0xf)
+                x86 += (eax >> 20) & 0xff;
+
+        return x86;
+}
+
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
 		     struct kvm_x86_state *state);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 6bf01ded3776..59dcfe1967cc 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1516,18 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
 	return cpuid;
 }
 
-static inline unsigned x86_family(unsigned int eax)
-{
-        unsigned int x86;
-
-        x86 = (eax >> 8) & 0xf;
-
-        if (x86 == 0xf)
-                x86 += (eax >> 20) & 0xff;
-
-        return x86;
-}
-
 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 {
 	const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
-- 
cgit v1.2.3


From 2ba9047424fc7243c63ac57f5fdfa754aa895e3c Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 14 Jan 2022 21:24:30 -0800
Subject: selftests: kvm/x86: Introduce x86_model()

Extract the x86 model number from CPUID.01H:EAX.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220115052431.447232-6-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 4eb73959ce05..122447827954 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -378,6 +378,11 @@ static inline unsigned int x86_family(unsigned int eax)
         return x86;
 }
 
+static inline unsigned int x86_model(unsigned int eax)
+{
+	return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
+}
+
 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
 		     struct kvm_x86_state *state);
-- 
cgit v1.2.3


From bef9a701f3ebfb60da259b04778d24128505a96c Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 14 Jan 2022 21:24:31 -0800
Subject: selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER

Verify that the PMU event filter works as expected.

Note that the virtual PMU doesn't work as expected on AMD Zen CPUs (an
intercepted rdmsr is counted as a retired branch instruction), but the
PMU event filter does work.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220115052431.447232-7-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore             |   1 +
 tools/testing/selftests/kvm/Makefile               |   1 +
 .../selftests/kvm/x86_64/pmu_event_filter_test.c   | 438 +++++++++++++++++++++
 3 files changed, 440 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 20b4c921c9a2..9fe19f412f44 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -22,6 +22,7 @@
 /x86_64/mmio_warning_test
 /x86_64/mmu_role_test
 /x86_64/platform_info_test
+/x86_64/pmu_event_filter_test
 /x86_64/set_boot_cpu_id
 /x86_64/set_sregs_test
 /x86_64/sev_migrate_tests
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ec78a8692899..7fbc80e3ecdf 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
new file mode 100644
index 000000000000..aa104946e6e0
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_SET_PMU_EVENT_FILTER.
+ *
+ * Copyright (C) 2022, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies the expected behavior of allow lists and deny lists for
+ * virtual PMU events.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * In lieu of copying perf_event.h into tools...
+ */
+#define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
+
+union cpuid10_eax {
+	struct {
+		unsigned int version_id:8;
+		unsigned int num_counters:8;
+		unsigned int bit_width:8;
+		unsigned int mask_length:8;
+	} split;
+	unsigned int full;
+};
+
+union cpuid10_ebx {
+	struct {
+		unsigned int no_unhalted_core_cycles:1;
+		unsigned int no_instructions_retired:1;
+		unsigned int no_unhalted_reference_cycles:1;
+		unsigned int no_llc_reference:1;
+		unsigned int no_llc_misses:1;
+		unsigned int no_branch_instruction_retired:1;
+		unsigned int no_branch_misses_retired:1;
+	} split;
+	unsigned int full;
+};
+
+/* End of stuff taken from perf_event.h. */
+
+/* Oddly, this isn't in perf_event.h. */
+#define ARCH_PERFMON_BRANCHES_RETIRED		5
+
+#define VCPU_ID 0
+#define NUM_BRANCHES 42
+
+/*
+ * This is how the event selector and unit mask are stored in an AMD
+ * core performance event-select register. Intel's format is similar,
+ * but the event selector is only 8 bits.
+ */
+#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
+			      (umask & 0xff) << 8)
+
+/*
+ * "Branch instructions retired", from the Intel SDM, volume 3,
+ * "Pre-defined Architectural Performance Events."
+ */
+
+#define INTEL_BR_RETIRED EVENT(0xc4, 0)
+
+/*
+ * "Retired branch instructions", from Processor Programming Reference
+ * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
+ * Preliminary Processor Programming Reference (PPR) for AMD Family
+ * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
+ * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
+ * B1 Processors Volume 1 of 2.
+ */
+
+#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
+
+/*
+ * This event list comprises Intel's eight architectural events plus
+ * AMD's "retired branch instructions" for Zen[123] (and possibly
+ * other AMD CPUs).
+ */
+static const uint64_t event_list[] = {
+	EVENT(0x3c, 0),
+	EVENT(0xc0, 0),
+	EVENT(0x3c, 1),
+	EVENT(0x2e, 0x4f),
+	EVENT(0x2e, 0x41),
+	EVENT(0xc4, 0),
+	EVENT(0xc5, 0),
+	EVENT(0xa4, 1),
+	AMD_ZEN_BR_RETIRED,
+};
+
+/*
+ * If we encounter a #GP during the guest PMU sanity check, then the guest
+ * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
+ */
+static void guest_gp_handler(struct ex_regs *regs)
+{
+	GUEST_SYNC(0);
+}
+
+/*
+ * Check that we can write a new value to the given MSR and read it back.
+ * The caller should provide a non-empty set of bits that are safe to flip.
+ *
+ * Return on success. GUEST_SYNC(0) on error.
+ */
+static void check_msr(uint32_t msr, uint64_t bits_to_flip)
+{
+	uint64_t v = rdmsr(msr) ^ bits_to_flip;
+
+	wrmsr(msr, v);
+	if (rdmsr(msr) != v)
+		GUEST_SYNC(0);
+
+	v ^= bits_to_flip;
+	wrmsr(msr, v);
+	if (rdmsr(msr) != v)
+		GUEST_SYNC(0);
+}
+
+static void intel_guest_code(void)
+{
+	check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
+	check_msr(MSR_P6_EVNTSEL0, 0xffff);
+	check_msr(MSR_IA32_PMC0, 0xffff);
+	GUEST_SYNC(1);
+
+	for (;;) {
+		uint64_t br0, br1;
+
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+		wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
+		br0 = rdmsr(MSR_IA32_PMC0);
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+		br1 = rdmsr(MSR_IA32_PMC0);
+		GUEST_SYNC(br1 - br0);
+	}
+}
+
+/*
+ * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
+ * this code uses the always-available, legacy K7 PMU MSRs, which alias to
+ * the first four of the six extended core PMU MSRs.
+ */
+static void amd_guest_code(void)
+{
+	check_msr(MSR_K7_EVNTSEL0, 0xffff);
+	check_msr(MSR_K7_PERFCTR0, 0xffff);
+	GUEST_SYNC(1);
+
+	for (;;) {
+		uint64_t br0, br1;
+
+		wrmsr(MSR_K7_EVNTSEL0, 0);
+		wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
+		br0 = rdmsr(MSR_K7_PERFCTR0);
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+		br1 = rdmsr(MSR_K7_PERFCTR0);
+		GUEST_SYNC(br1 - br0);
+	}
+}
+
+/*
+ * Run the VM to the next GUEST_SYNC(value), and return the value passed
+ * to the sync. Any other exit from the guest is fatal.
+ */
+static uint64_t run_vm_to_sync(struct kvm_vm *vm)
+{
+	struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+	struct ucall uc;
+
+	vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+		    "Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+		    run->exit_reason,
+		    exit_reason_str(run->exit_reason));
+	get_ucall(vm, VCPU_ID, &uc);
+	TEST_ASSERT(uc.cmd == UCALL_SYNC,
+		    "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
+	return uc.args[1];
+}
+
+/*
+ * In a nested environment or if the vPMU is disabled, the guest PMU
+ * might not work as architected (accessing the PMU MSRs may raise
+ * #GP, or writes could simply be discarded). In those situations,
+ * there is no point in running these tests. The guest code will perform
+ * a sanity check and then GUEST_SYNC(success). In the case of failure,
+ * the behavior of the guest on resumption is undefined.
+ */
+static bool sanity_check_pmu(struct kvm_vm *vm)
+{
+	bool success;
+
+	vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+	success = run_vm_to_sync(vm);
+	vm_install_exception_handler(vm, GP_VECTOR, NULL);
+
+	return success;
+}
+
+static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
+{
+	struct kvm_pmu_event_filter *f;
+	int size = sizeof(*f) + nevents * sizeof(f->events[0]);
+
+	f = malloc(size);
+	TEST_ASSERT(f, "Out of memory");
+	memset(f, 0, size);
+	f->nevents = nevents;
+	return f;
+}
+
+static struct kvm_pmu_event_filter *event_filter(uint32_t action)
+{
+	struct kvm_pmu_event_filter *f;
+	int i;
+
+	f = make_pmu_event_filter(ARRAY_SIZE(event_list));
+	f->action = action;
+	for (i = 0; i < ARRAY_SIZE(event_list); i++)
+		f->events[i] = event_list[i];
+
+	return f;
+}
+
+/*
+ * Remove the first occurrence of 'event' (if any) from the filter's
+ * event list.
+ */
+static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
+						 uint64_t event)
+{
+	bool found = false;
+	int i;
+
+	for (i = 0; i < f->nevents; i++) {
+		if (found)
+			f->events[i - 1] = f->events[i];
+		else
+			found = f->events[i] == event;
+	}
+	if (found)
+		f->nevents--;
+	return f;
+}
+
+static void test_without_filter(struct kvm_vm *vm)
+{
+	uint64_t count = run_vm_to_sync(vm);
+
+	if (count != NUM_BRANCHES)
+		pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+			__func__, count, NUM_BRANCHES);
+	TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static uint64_t test_with_filter(struct kvm_vm *vm,
+				 struct kvm_pmu_event_filter *f)
+{
+	vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
+	return run_vm_to_sync(vm);
+}
+
+static void test_member_deny_list(struct kvm_vm *vm)
+{
+	struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
+	uint64_t count = test_with_filter(vm, f);
+
+	free(f);
+	if (count)
+		pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
+			__func__, count);
+	TEST_ASSERT(!count, "Disallowed PMU Event is counting");
+}
+
+static void test_member_allow_list(struct kvm_vm *vm)
+{
+	struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
+	uint64_t count = test_with_filter(vm, f);
+
+	free(f);
+	if (count != NUM_BRANCHES)
+		pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+			__func__, count, NUM_BRANCHES);
+	TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static void test_not_member_deny_list(struct kvm_vm *vm)
+{
+	struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
+	uint64_t count;
+
+	remove_event(f, INTEL_BR_RETIRED);
+	remove_event(f, AMD_ZEN_BR_RETIRED);
+	count = test_with_filter(vm, f);
+	free(f);
+	if (count != NUM_BRANCHES)
+		pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
+			__func__, count, NUM_BRANCHES);
+	TEST_ASSERT(count, "Allowed PMU event is not counting");
+}
+
+static void test_not_member_allow_list(struct kvm_vm *vm)
+{
+	struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
+	uint64_t count;
+
+	remove_event(f, INTEL_BR_RETIRED);
+	remove_event(f, AMD_ZEN_BR_RETIRED);
+	count = test_with_filter(vm, f);
+	free(f);
+	if (count)
+		pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
+			__func__, count);
+	TEST_ASSERT(!count, "Disallowed PMU Event is counting");
+}
+
+/*
+ * Check for a non-zero PMU version, at least one general-purpose
+ * counter per logical processor, an EBX bit vector of length greater
+ * than 5, and EBX[5] clear.
+ */
+static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
+{
+	union cpuid10_eax eax = { .full = entry->eax };
+	union cpuid10_ebx ebx = { .full = entry->ebx };
+
+	return eax.split.version_id && eax.split.num_counters > 0 &&
+		eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
+		!ebx.split.no_branch_instruction_retired;
+}
+
+/*
+ * Note that CPUID leaf 0xa is Intel-specific. This leaf should be
+ * clear on AMD hardware.
+ */
+static bool use_intel_pmu(void)
+{
+	struct kvm_cpuid_entry2 *entry;
+	struct kvm_cpuid2 *cpuid;
+
+	cpuid = kvm_get_supported_cpuid();
+	entry = kvm_get_supported_cpuid_index(0xa, 0);
+	return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
+}
+
+static bool is_zen1(uint32_t eax)
+{
+	return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
+}
+
+static bool is_zen2(uint32_t eax)
+{
+	return x86_family(eax) == 0x17 &&
+		x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
+}
+
+static bool is_zen3(uint32_t eax)
+{
+	return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
+}
+
+/*
+ * Determining AMD support for a PMU event requires consulting the AMD
+ * PPR for the CPU or reference material derived therefrom. The AMD
+ * test code herein has been verified to work on Zen1, Zen2, and Zen3.
+ *
+ * Feel free to add more AMD CPUs that are documented to support event
+ * select 0xc2 umask 0 as "retired branch instructions."
+ */
+static bool use_amd_pmu(void)
+{
+	struct kvm_cpuid_entry2 *entry;
+	struct kvm_cpuid2 *cpuid;
+
+	cpuid = kvm_get_supported_cpuid();
+	entry = kvm_get_supported_cpuid_index(1, 0);
+	return is_amd_cpu() && entry &&
+		(is_zen1(entry->eax) ||
+		 is_zen2(entry->eax) ||
+		 is_zen3(entry->eax));
+}
+
+int main(int argc, char *argv[])
+{
+	void (*guest_code)(void) = NULL;
+	struct kvm_vm *vm;
+	int r;
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
+	if (!r) {
+		print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
+		exit(KSFT_SKIP);
+	}
+
+	if (use_intel_pmu())
+		guest_code = intel_guest_code;
+	else if (use_amd_pmu())
+		guest_code = amd_guest_code;
+
+	if (!guest_code) {
+		print_skip("Don't know how to test this guest PMU");
+		exit(KSFT_SKIP);
+	}
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vm, VCPU_ID);
+
+	if (!sanity_check_pmu(vm)) {
+		print_skip("Guest PMU is not functional");
+		exit(KSFT_SKIP);
+	}
+
+	test_without_filter(vm);
+	test_member_deny_list(vm);
+	test_member_allow_list(vm);
+	test_not_member_deny_list(vm);
+	test_not_member_allow_list(vm);
+
+	kvm_vm_free(vm);
+
+	return 0;
+}
-- 
cgit v1.2.3


From e337f7e063641ca4d040c8210d4bd790b81effb0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 28 Dec 2021 23:24:37 +0000
Subject: KVM: selftests: Add a test to force emulation with a pending
 exception

Add a VMX specific test to verify that KVM doesn't explode if userspace
attempts KVM_RUN when emulation is required with a pending exception.
KVM VMX's emulation support for !unrestricted_guest punts exceptions to
userspace instead of attempting to synthesize the exception with all the
correct state (and stack switching, etc...).

Punting is acceptable as there's never been a request to support
injecting exceptions when emulating due to invalid state, but KVM has
historically assumed that userspace will do the right thing and either
clear the exception or kill the guest.  Deliberately do the opposite and
attempt to re-enter the guest with a pending exception and emulation
required to verify KVM continues to punt the combination to userspace,
e.g. doesn't explode, WARN, etc...

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20211228232437.1875318-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore             |   1 +
 tools/testing/selftests/kvm/Makefile               |   1 +
 .../vmx_exception_with_invalid_guest_state.c       | 139 +++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 9fe19f412f44..c2165b137a50 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -37,6 +37,7 @@
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
+/x86_64/vmx_exception_with_invalid_guest_state
 /x86_64/vmx_invalid_nested_guest_state
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 7fbc80e3ecdf..81ebf99d6ff0 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -70,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
new file mode 100644
index 000000000000..27a850f3d7ce
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <signal.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID	0
+
+static struct kvm_vm *vm;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+	/* Loop on the ud2 until guest state is made invalid. */
+}
+
+static void guest_code(void)
+{
+	asm volatile("ud2");
+}
+
+static void __run_vcpu_with_invalid_state(void)
+{
+	struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+	vcpu_run(vm, VCPU_ID);
+
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+		    "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+	TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
+		    "Expected emulation failure, got %d\n",
+		    run->emulation_failure.suberror);
+}
+
+static void run_vcpu_with_invalid_state(void)
+{
+	/*
+	 * Always run twice to verify KVM handles the case where _KVM_ queues
+	 * an exception with invalid state and then exits to userspace, i.e.
+	 * that KVM doesn't explode if userspace ignores the initial error.
+	 */
+	__run_vcpu_with_invalid_state();
+	__run_vcpu_with_invalid_state();
+}
+
+static void set_timer(void)
+{
+	struct itimerval timer;
+
+	timer.it_value.tv_sec  = 0;
+	timer.it_value.tv_usec = 200;
+	timer.it_interval = timer.it_value;
+	ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
+}
+
+static void set_or_clear_invalid_guest_state(bool set)
+{
+	static struct kvm_sregs sregs;
+
+	if (!sregs.cr0)
+		vcpu_sregs_get(vm, VCPU_ID, &sregs);
+	sregs.tr.unusable = !!set;
+	vcpu_sregs_set(vm, VCPU_ID, &sregs);
+}
+
+static void set_invalid_guest_state(void)
+{
+	set_or_clear_invalid_guest_state(true);
+}
+
+static void clear_invalid_guest_state(void)
+{
+	set_or_clear_invalid_guest_state(false);
+}
+
+static void sigalrm_handler(int sig)
+{
+	struct kvm_vcpu_events events;
+
+	TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
+
+	vcpu_events_get(vm, VCPU_ID, &events);
+
+	/*
+	 * If an exception is pending, attempt KVM_RUN with invalid guest,
+	 * otherwise rearm the timer and keep doing so until the timer fires
+	 * between KVM queueing an exception and re-entering the guest.
+	 */
+	if (events.exception.pending) {
+		set_invalid_guest_state();
+		run_vcpu_with_invalid_state();
+	} else {
+		set_timer();
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
+		print_skip("Must be run with kvm_intel.unrestricted_guest=0");
+		exit(KSFT_SKIP);
+	}
+
+	vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vm, VCPU_ID);
+
+	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+	/*
+	 * Stuff invalid guest state for L2 by making TR unusuable.  The next
+	 * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
+	 * emulating invalid guest state for L2.
+	 */
+	set_invalid_guest_state();
+	run_vcpu_with_invalid_state();
+
+	/*
+	 * Verify KVM also handles the case where userspace gains control while
+	 * an exception is pending and stuffs invalid state.  Run with valid
+	 * guest state and a timer firing every 200us, and attempt to enter the
+	 * guest with invalid state when the handler interrupts KVM with an
+	 * exception pending.
+	 */
+	clear_invalid_guest_state();
+	TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
+		    "Failed to register SIGALRM handler, errno = %d (%s)",
+		    errno, strerror(errno));
+
+	set_timer();
+	run_vcpu_with_invalid_state();
+}
-- 
cgit v1.2.3


From d5ad5b1c04c85f01850e88231cad7dfbc9e1d30c Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Tue, 18 Jan 2022 17:20:52 +0500
Subject: selftests: kvm: add amx_test to .gitignore

amx_test's binary should be present in the .gitignore file for the git
to ignore it.

Fixes: bf70636d9443 ("selftest: kvm: Add amx selftest")
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Message-Id: <20220118122053.1941915-1-usama.anjum@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index c2165b137a50..dce7de7755e6 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -8,6 +8,7 @@
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
+/x86_64/amx_test
 /x86_64/cpuid_test
 /x86_64/cr4_cpuid_sync_test
 /x86_64/debug_regs
-- 
cgit v1.2.3


From 8326c79d10be2ddbfd3d3804206949a71cb15675 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 13 Nov 2021 10:43:52 -0300
Subject: tools headers UAPI: Sync x86 arch prctl headers with the kernel
 sources

To pick the changes in this cset:

  980fe2fddcff2193 ("x86/fpu: Extend fpu_xstate_prctl() with guest permissions")

This picks these new prctls:

  $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/before
  $ cp arch/x86/include/uapi/asm/prctl.h tools/arch/x86/include/uapi/asm/prctl.h
  $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/after
  $ diff -u /tmp/before /tmp/after
  --- /tmp/before	2022-01-19 14:40:05.049394977 -0300
  +++ /tmp/after	2022-01-19 14:40:35.628154565 -0300
  @@ -9,6 +9,8 @@
   	[0x1021 - 0x1001]= "GET_XCOMP_SUPP",
   	[0x1022 - 0x1001]= "GET_XCOMP_PERM",
   	[0x1023 - 0x1001]= "REQ_XCOMP_PERM",
  +	[0x1024 - 0x1001]= "GET_XCOMP_GUEST_PERM",
  +	[0x1025 - 0x1001]= "REQ_XCOMP_GUEST_PERM",
   };

   #define x86_arch_prctl_codes_2_offset 0x2001
  $

With this 'perf trace' can translate those numbers into strings and use
the strings in filter expressions:

  # perf trace -e prctl
       0.000 ( 0.011 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9c014b7df5)     = 0
       0.032 ( 0.002 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9bb6b51580)     = 0
       5.452 ( 0.003 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfeb70) = 0
       5.468 ( 0.002 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfea70) = 0
      24.494 ( 0.009 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f562a32ae28) = 0
      24.540 ( 0.002 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f563c6d4b30) = 0
     670.281 ( 0.008 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30805c8) = 0
     670.293 ( 0.002 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30800f0) = 0
  ^C#

This addresses these perf build warnings:

  Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/prctl.h' differs from latest version at 'arch/x86/include/uapi/asm/prctl.h'
  diff -u tools/arch/x86/include/uapi/asm/prctl.h arch/x86/include/uapi/asm/prctl.h

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/uapi/asm/prctl.h | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h
index 754a07856817..500b96e71f18 100644
--- a/tools/arch/x86/include/uapi/asm/prctl.h
+++ b/tools/arch/x86/include/uapi/asm/prctl.h
@@ -2,20 +2,22 @@
 #ifndef _ASM_X86_PRCTL_H
 #define _ASM_X86_PRCTL_H
 
-#define ARCH_SET_GS		0x1001
-#define ARCH_SET_FS		0x1002
-#define ARCH_GET_FS		0x1003
-#define ARCH_GET_GS		0x1004
+#define ARCH_SET_GS			0x1001
+#define ARCH_SET_FS			0x1002
+#define ARCH_GET_FS			0x1003
+#define ARCH_GET_GS			0x1004
 
-#define ARCH_GET_CPUID		0x1011
-#define ARCH_SET_CPUID		0x1012
+#define ARCH_GET_CPUID			0x1011
+#define ARCH_SET_CPUID			0x1012
 
-#define ARCH_GET_XCOMP_SUPP	0x1021
-#define ARCH_GET_XCOMP_PERM	0x1022
-#define ARCH_REQ_XCOMP_PERM	0x1023
+#define ARCH_GET_XCOMP_SUPP		0x1021
+#define ARCH_GET_XCOMP_PERM		0x1022
+#define ARCH_REQ_XCOMP_PERM		0x1023
+#define ARCH_GET_XCOMP_GUEST_PERM	0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM	0x1025
 
-#define ARCH_MAP_VDSO_X32	0x2001
-#define ARCH_MAP_VDSO_32	0x2002
-#define ARCH_MAP_VDSO_64	0x2003
+#define ARCH_MAP_VDSO_X32		0x2001
+#define ARCH_MAP_VDSO_32		0x2002
+#define ARCH_MAP_VDSO_64		0x2003
 
 #endif /* _ASM_X86_PRCTL_H */
-- 
cgit v1.2.3


From fa68118144c63e292628945c5b8feb16b84fea7d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 19 Jan 2022 13:30:23 -0500
Subject: kvm: selftests: sync uapi/linux/kvm.h with Linux header

KVM_CAP_XSAVE2 is out of sync due to a conflict.  Copy the whole
file while at it.

Reported-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/include/uapi/linux/kvm.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index f066637ee206..9563d294f181 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
-#define KVM_CAP_XSAVE2 207
+#define KVM_CAP_VM_GPA_BITS 207
+#define KVM_CAP_XSAVE2 208
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
@@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;
@@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_XSAVE */
 #define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
 #define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
-/* Available with KVM_CAP_XSAVE2 */
-#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
@@ -1613,6 +1623,9 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)
 
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2		  _IOR(KVMIO,  0xcf, struct kvm_xsave)
+
 struct kvm_s390_pv_sec_parm {
 	__u64 origin;
 	__u64 length;
-- 
cgit v1.2.3


From 96c852c8bf52af2d34654e700d9b5d2e8a99bae5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 19 Jan 2022 13:34:30 -0500
Subject: kvm: selftests: Do not indent with spaces

Some indentation with spaces crept in, likely due to terminal-based
cut and paste.  Clean it up.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/include/x86_64/processor.h       | 10 ++--
 tools/testing/selftests/kvm/lib/kvm_util.c         |  8 +--
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 60 +++++++++++-----------
 tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c |  4 +-
 .../kvm/x86_64/vmx_close_while_nested_test.c       |  4 +-
 .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 34 ++++++------
 6 files changed, 61 insertions(+), 59 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 122447827954..423d8a61bd2e 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -368,14 +368,14 @@ bool is_amd_cpu(void);
 
 static inline unsigned int x86_family(unsigned int eax)
 {
-        unsigned int x86;
+	unsigned int x86;
 
-        x86 = (eax >> 8) & 0xf;
+	x86 = (eax >> 8) & 0xf;
 
-        if (x86 == 0xf)
-                x86 += (eax >> 20) & 0xff;
+	if (x86 == 0xf)
+		x86 += (eax >> 20) & 0xff;
 
-        return x86;
+	return x86;
 }
 
 static inline unsigned int x86_model(unsigned int eax)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index c22a17aac6b0..8c53f96ab7fe 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -499,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
 void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
 			    uint64_t first_page, uint32_t num_pages)
 {
-	struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
-		                            .first_page = first_page,
-	                                    .num_pages = num_pages };
+	struct kvm_clear_dirty_log args = {
+		.dirty_bitmap = log, .slot = slot,
+		.first_page = first_page,
+		.num_pages = num_pages
+	};
 	int ret;
 
 	ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 59dcfe1967cc..5c8c270a9158 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1144,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
 	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
 	list->nmsrs = nmsrs;
 	r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+		    r);
 
 	state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
 	r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
+		    r);
 
 	r = vcpu_save_xsave_state(vm, vcpu, state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
+		    r);
 
 	if (kvm_check_cap(KVM_CAP_XCRS)) {
 		r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
@@ -1171,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
 	}
 
 	r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
+		    r);
 
 	if (nested_size) {
 		state->nested.size = sizeof(state->nested_);
 		r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
 		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
-			r);
+			    r);
 		TEST_ASSERT(state->nested.size <= nested_size,
-			"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
-			state->nested.size, nested_size);
+			    "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
+			    state->nested.size, nested_size);
 	} else
 		state->nested.size = 0;
 
@@ -1189,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
 	for (i = 0; i < nmsrs; i++)
 		state->msrs.entries[i].index = list->indices[i];
 	r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
-        TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
-                r, r == nmsrs ? -1 : list->indices[r]);
+	TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
+		    r, r == nmsrs ? -1 : list->indices[r]);
 
 	r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
+		    r);
 
 	free(list);
 	return state;
@@ -1207,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 
 	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
 	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
-                r);
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
 	TEST_ASSERT(r == state->msrs.nmsrs,
@@ -1222,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
 
 	r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
 	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
-                r);
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
+		    r);
 
 	r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
-        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
-                r);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
+		    r);
 
 	if (state->nested.size) {
 		r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
 		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
-			r);
+			    r);
 	}
 }
 
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
index 5a6a662f2e59..a426078b16a3 100644
--- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
 	switch (get_ucall(vm, vcpuid, &uc)) {
 	case UCALL_SYNC:
 		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-                            uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
-                            stage + 1, (ulong)uc.args[1]);
+			    uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+			    stage + 1, (ulong)uc.args[1]);
 		return;
 	case UCALL_DONE:
 		return;
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
index 2835a17f1b7a..edac8839e717 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
@@ -30,8 +30,8 @@ static struct kvm_vm *vm;
 static void l2_guest_code(void)
 {
 	/* Exit to L0 */
-        asm volatile("inb %%dx, %%al"
-                     : : [port] "d" (PORT_L0_EXIT) : "rax");
+	asm volatile("inb %%dx, %%al"
+		     : : [port] "d" (PORT_L0_EXIT) : "rax");
 }
 
 static void l1_guest_code(struct vmx_pages *vmx_pages)
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 478e0ae8b93e..865e17146815 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -46,20 +46,20 @@ static struct kvm_vm *vm;
 #define MIN_STEAL_TIME		50000
 
 struct pvclock_vcpu_time_info {
-        u32   version;
-        u32   pad0;
-        u64   tsc_timestamp;
-        u64   system_time;
-        u32   tsc_to_system_mul;
-        s8    tsc_shift;
-        u8    flags;
-        u8    pad[2];
+	u32   version;
+	u32   pad0;
+	u64   tsc_timestamp;
+	u64   system_time;
+	u32   tsc_to_system_mul;
+	s8    tsc_shift;
+	u8    flags;
+	u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
-        u32   version;
-        u32   sec;
-        u32   nsec;
+	u32   version;
+	u32   sec;
+	u32   nsec;
 } __attribute__((__packed__));
 
 struct vcpu_runstate_info {
@@ -74,11 +74,11 @@ struct arch_vcpu_info {
 };
 
 struct vcpu_info {
-        uint8_t evtchn_upcall_pending;
-        uint8_t evtchn_upcall_mask;
-        unsigned long evtchn_pending_sel;
-        struct arch_vcpu_info arch;
-        struct pvclock_vcpu_time_info time;
+	uint8_t evtchn_upcall_pending;
+	uint8_t evtchn_upcall_mask;
+	unsigned long evtchn_pending_sel;
+	struct arch_vcpu_info arch;
+	struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
 struct shared_info {
@@ -493,7 +493,7 @@ int main(int argc, char *argv[])
 
 	vm_ts.tv_sec = wc->sec;
 	vm_ts.tv_nsec = wc->nsec;
-        TEST_ASSERT(wc->version && !(wc->version & 1),
+	TEST_ASSERT(wc->version && !(wc->version & 1),
 		    "Bad wallclock version %x", wc->version);
 	TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
 	TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
-- 
cgit v1.2.3


From 4cfb943537ed3716daf668ca5a33d3ce667f82a3 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 19 Jan 2022 18:08:36 -0800
Subject: tools/bpf/bpftool/skeleton: replace bpf_probe_read_kernel with
 bpf_probe_read_kernel_str to get task comm

bpf_probe_read_kernel_str() will add a nul terminator to the dst, then
we don't care about if the dst size is big enough.

Link: https://lkml.kernel.org/r/20211120112738.45980-7-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
index d9b420972934..f70702fcb224 100644
--- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
+++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
@@ -71,8 +71,8 @@ int iter(struct bpf_iter__task_file *ctx)
 
 	e.pid = task->tgid;
 	e.id = get_obj_id(file->private_data, obj_type);
-	bpf_probe_read_kernel(&e.comm, sizeof(e.comm),
-			      task->group_leader->comm);
+	bpf_probe_read_kernel_str(&e.comm, sizeof(e.comm),
+				  task->group_leader->comm);
 	bpf_seq_write(ctx->meta->seq, &e, sizeof(e));
 
 	return 0;
-- 
cgit v1.2.3


From 3087c61ed2c48548b74dd343a5209b87082c682d Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 19 Jan 2022 18:08:40 -0800
Subject: tools/testing/selftests/bpf: replace open-coded 16 with TASK_COMM_LEN

As the sched:sched_switch tracepoint args are derived from the kernel,
we'd better make it same with the kernel.  So the macro TASK_COMM_LEN is
converted to type enum, then all the BPF programs can get it through
BTF.

The BPF program which wants to use TASK_COMM_LEN should include the
header vmlinux.h.  Regarding the test_stacktrace_map and
test_tracepoint, as the type defined in linux/bpf.h are also defined in
vmlinux.h, so we don't need to include linux/bpf.h again.

Link: https://lkml.kernel.org/r/20211120112738.45980-8-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h                                   | 9 +++++++--
 tools/testing/selftests/bpf/progs/test_stacktrace_map.c | 6 +++---
 tools/testing/selftests/bpf/progs/test_tracepoint.c     | 6 +++---
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..cecd4806edc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,8 +274,13 @@ struct task_group;
 
 #define get_current_state()	READ_ONCE(current->__state)
 
-/* Task command name length: */
-#define TASK_COMM_LEN			16
+/*
+ * Define the task command name length as enum, then it can be visible to
+ * BPF programs.
+ */
+enum {
+	TASK_COMM_LEN = 16,
+};
 
 extern void scheduler_tick(void);
 
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
index a8233e7f173b..728dbd39eff0 100644
--- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2018 Facebook
 
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 #ifndef PERF_MAX_STACK_DEPTH
@@ -41,11 +41,11 @@ struct {
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
 struct sched_switch_args {
 	unsigned long long pad;
-	char prev_comm[16];
+	char prev_comm[TASK_COMM_LEN];
 	int prev_pid;
 	int prev_prio;
 	long long prev_state;
-	char next_comm[16];
+	char next_comm[TASK_COMM_LEN];
 	int next_pid;
 	int next_prio;
 };
diff --git a/tools/testing/selftests/bpf/progs/test_tracepoint.c b/tools/testing/selftests/bpf/progs/test_tracepoint.c
index ce6974016f53..43bd7a20cc50 100644
--- a/tools/testing/selftests/bpf/progs/test_tracepoint.c
+++ b/tools/testing/selftests/bpf/progs/test_tracepoint.c
@@ -1,17 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2017 Facebook
 
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
 struct sched_switch_args {
 	unsigned long long pad;
-	char prev_comm[16];
+	char prev_comm[TASK_COMM_LEN];
 	int prev_pid;
 	int prev_prio;
 	long long prev_state;
-	char next_comm[16];
+	char next_comm[TASK_COMM_LEN];
 	int next_pid;
 	int next_prio;
 };
-- 
cgit v1.2.3


From fd0a1462405b087377e59b84e119fe7e2d08499a Mon Sep 17 00:00:00 2001
From: Isabella Basso <isabbasso@riseup.net>
Date: Wed, 19 Jan 2022 18:09:02 -0800
Subject: hash.h: remove unused define directive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "test_hash.c: refactor into KUnit", v3.

We refactored the lib/test_hash.c file into KUnit as part of the student
group LKCAMP [1] introductory hackathon for kernel development.

This test was pointed to our group by Daniel Latypov [2], so its full
conversion into a pure KUnit test was our goal in this patch series, but
we ran into many problems relating to it not being split as unit tests,
which complicated matters a bit, as the reasoning behind the original
tests is quite cryptic for those unfamiliar with hash implementations.

Some interesting developments we'd like to highlight are:

 - In patch 1/5 we noticed that there was an unused define directive
   that could be removed.

 - In patch 4/5 we noticed how stringhash and hash tests are all under
   the lib/test_hash.c file, which might cause some confusion, and we
   also broke those kernel config entries up.

Overall KUnit developments have been made in the other patches in this
series:

In patches 2/5, 3/5 and 5/5 we refactored the lib/test_hash.c file so as
to make it more compatible with the KUnit style, whilst preserving the
original idea of the maintainer who designed it (i.e.  George Spelvin),
which might be undesirable for unit tests, but we assume it is enough
for a first patch.

This patch (of 5):

Currently, there exist hash_32() and __hash_32() functions, which were
introduced in a patch [1] targeting architecture specific optimizations.
These functions can be overridden on a per-architecture basis to achieve
such optimizations.  They must set their corresponding define directive
(HAVE_ARCH_HASH_32 and HAVE_ARCH__HASH_32, respectively) so that header
files can deal with these overrides properly.

As the supported 32-bit architectures that have their own hash function
implementation (i.e.  m68k, Microblaze, H8/300, pa-risc) have only been
making use of the (more general) __hash_32() function (which only lacks
a right shift operation when compared to the hash_32() function), remove
the define directive corresponding to the arch-specific hash_32()
implementation.

[1] https://lore.kernel.org/lkml/20160525073311.5600.qmail@ns.sciencehorizons.net/

[akpm@linux-foundation.org: hash_32_generic() becomes hash_32()]

Link: https://lkml.kernel.org/r/20211208183711.390454-1-isabbasso@riseup.net
Link: https://lkml.kernel.org/r/20211208183711.390454-2-isabbasso@riseup.net
Reviewed-by: David Gow <davidgow@google.com>
Tested-by: David Gow <davidgow@google.com>
Co-developed-by: Augusto Durães Camargo <augusto.duraes33@gmail.com>
Signed-off-by: Augusto Durães Camargo <augusto.duraes33@gmail.com>
Co-developed-by: Enzo Ferreira <ferreiraenzoa@gmail.com>
Signed-off-by: Enzo Ferreira <ferreiraenzoa@gmail.com>
Signed-off-by: Isabella Basso <isabbasso@riseup.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Daniel Latypov <dlatypov@google.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/infiniband/sw/rxe/rxe_qp.c |  3 +--
 include/linux/hash.h               |  5 +----
 lib/test_hash.c                    | 24 +-----------------------
 tools/include/linux/hash.h         |  5 +----
 4 files changed, 4 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 54b8711321c1..44c9ea601bff 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -217,8 +217,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
 	 * the port number must be in the Dynamic Ports range
 	 * (0xc000 - 0xffff).
 	 */
-	qp->src_port = RXE_ROCE_V2_SPORT +
-		(hash_32_generic(qp_num(qp), 14) & 0x3fff);
+	qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff);
 	qp->sq.max_wr		= init->cap.max_send_wr;
 
 	/* These caps are limited by rxe_qp_chk_cap() done by the caller */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index ad6fa21d977b..38edaa08f862 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val)
 	return val * GOLDEN_RATIO_32;
 }
 
-#ifndef HAVE_ARCH_HASH_32
-#define hash_32 hash_32_generic
-#endif
-static inline u32 hash_32_generic(u32 val, unsigned int bits)
+static inline u32 hash_32(u32 val, unsigned int bits)
 {
 	/* High bits are more random, so use them. */
 	return __hash_32(val) >> (32 - bits);
diff --git a/lib/test_hash.c b/lib/test_hash.c
index 0ee40b4a56dd..d4b0cfdb0377 100644
--- a/lib/test_hash.c
+++ b/lib/test_hash.c
@@ -94,22 +94,7 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33])
 			pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m);
 			return false;
 		}
-#ifdef HAVE_ARCH_HASH_32
-		h2 = hash_32_generic(h0, k);
-#if HAVE_ARCH_HASH_32 == 1
-		if (h1 != h2) {
-			pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() "
-				" = %#x", h0, k, h1, h2);
-			return false;
-		}
-#else
-		if (h2 > m) {
-			pr_err("hash_32_generic(%#x, %d) = %#x > %#x",
-				h0, k, h1, m);
-			return false;
-		}
-#endif
-#endif
+
 		/* Test hash_64 */
 		hash_or[1][k] |= h1 = hash_64(h64, k);
 		if (h1 > m) {
@@ -227,13 +212,6 @@ test_hash_init(void)
 #else
 	pr_info("__hash_32() has no arch implementation to test.");
 #endif
-#ifdef HAVE_ARCH_HASH_32
-#if HAVE_ARCH_HASH_32 != 1
-	pr_info("hash_32() is arch-specific; not compared to generic.");
-#endif
-#else
-	pr_info("hash_32() has no arch implementation to test.");
-#endif
 #ifdef HAVE_ARCH_HASH_64
 #if HAVE_ARCH_HASH_64 != 1
 	pr_info("hash_64() is arch-specific; not compared to generic.");
diff --git a/tools/include/linux/hash.h b/tools/include/linux/hash.h
index ad6fa21d977b..38edaa08f862 100644
--- a/tools/include/linux/hash.h
+++ b/tools/include/linux/hash.h
@@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val)
 	return val * GOLDEN_RATIO_32;
 }
 
-#ifndef HAVE_ARCH_HASH_32
-#define hash_32 hash_32_generic
-#endif
-static inline u32 hash_32_generic(u32 val, unsigned int bits)
+static inline u32 hash_32(u32 val, unsigned int bits)
 {
 	/* High bits are more random, so use them. */
 	return __hash_32(val) >> (32 - bits);
-- 
cgit v1.2.3


From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001
From: wangyong <wang.yong12@zte.com.cn>
Date: Wed, 19 Jan 2022 18:10:15 -0800
Subject: delayacct: track delays from memory compact

Delay accounting does not track the delay of memory compact.  When there
is not enough free memory, tasks can spend a amount of their time
waiting for compact.

To get the impact of tasks in direct memory compact, measure the delay
when allocating memory through memory compact.

Also update tools/accounting/getdelays.c:

    / # ./getdelays_next  -di -p 304
    print delayacct stats ON
    printing IO accounting
    PID     304

    CPU             count     real total  virtual total    delay total  delay average
                      277      780000000      849039485       18877296          0.068ms
    IO              count    delay total  delay average
                        0              0              0ms
    SWAP            count    delay total  delay average
                        0              0              0ms
    RECLAIM         count    delay total  delay average
                        5    11088812685           2217ms
    THRASHING       count    delay total  delay average
                        0              0              0ms
    COMPACT         count    delay total  delay average
                        3          72758              0ms
    watch: read=0, write=0, cancelled_write=0

Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn
Signed-off-by: wangyong <wang.yong12@zte.com.cn>
Reviewed-by: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Reviewed-by: Zhang Wenya <zhang.wenya1@zte.com.cn>
Reviewed-by: Yang Yang <yang.yang29@zte.com.cn>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h      | 28 ++++++++++++++++++++++++++++
 include/uapi/linux/taskstats.h |  6 +++++-
 kernel/delayacct.c             | 16 ++++++++++++++++
 mm/page_alloc.c                |  3 +++
 tools/accounting/getdelays.c   |  8 +++++++-
 5 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 435c3654a0ff..3e03d010bd2e 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -42,8 +42,12 @@ struct task_delay_info {
 	u64 thrashing_start;
 	u64 thrashing_delay;	/* wait for thrashing page */
 
+	u64 compact_start;
+	u64 compact_delay;	/* wait for memory compact */
+
 	u32 freepages_count;	/* total count of memory reclaim */
 	u32 thrashing_count;	/* total count of thrash waits */
+	u32 compact_count;	/* total count of memory compact */
 };
 #endif
 
@@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void);
 extern void __delayacct_thrashing_end(void);
 extern void __delayacct_swapin_start(void);
 extern void __delayacct_swapin_end(void);
+extern void __delayacct_compact_start(void);
+extern void __delayacct_compact_end(void);
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
@@ -170,6 +176,24 @@ static inline void delayacct_swapin_end(void)
 		__delayacct_swapin_end();
 }
 
+static inline void delayacct_compact_start(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_compact_start();
+}
+
+static inline void delayacct_compact_end(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_compact_end();
+}
+
 #else
 static inline void delayacct_init(void)
 {}
@@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void)
 {}
 static inline void delayacct_swapin_end(void)
 {}
+static inline void delayacct_compact_start(void)
+{}
+static inline void delayacct_compact_end(void)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index ccbd08709321..12327d32378f 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
  */
 
 
-#define TASKSTATS_VERSION	10
+#define TASKSTATS_VERSION	11
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -172,6 +172,10 @@ struct taskstats {
 
 	/* v10: 64-bit btime to avoid overflow */
 	__u64	ac_btime64;		/* 64-bit begin time */
+
+	/* Delay waiting for memory compact */
+	__u64	compact_count;
+	__u64	compact_delay_total;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 97699848c1f0..c5e8cea9e05f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -155,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
 	tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
 	d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
+	tmp = d->compact_delay_total + tsk->delays->compact_delay;
+	d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
 	d->thrashing_count += tsk->delays->thrashing_count;
+	d->compact_count += tsk->delays->compact_count;
 	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
@@ -213,3 +216,16 @@ void __delayacct_swapin_end(void)
 		      &current->delays->swapin_delay,
 		      &current->delays->swapin_count);
 }
+
+void __delayacct_compact_start(void)
+{
+	current->delays->compact_start = local_clock();
+}
+
+void __delayacct_compact_end(void)
+{
+	delayacct_end(&current->delays->lock,
+		      &current->delays->compact_start,
+		      &current->delays->compact_delay,
+		      &current->delays->compact_count);
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c5952749ad40..635063f49671 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@
 #include <linux/padata.h>
 #include <linux/khugepaged.h>
 #include <linux/buffer_head.h>
+#include <linux/delayacct.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4348,6 +4349,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 	psi_memstall_enter(&pflags);
+	delayacct_compact_start();
 	noreclaim_flag = memalloc_noreclaim_save();
 
 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
@@ -4355,6 +4357,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	memalloc_noreclaim_restore(noreclaim_flag);
 	psi_memstall_leave(&pflags);
+	delayacct_compact_end();
 
 	if (*compact_result == COMPACT_SKIPPED)
 		return NULL;
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 5ef1c15e88ad..11e86739456d 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -205,6 +205,8 @@ static void print_delayacct(struct taskstats *t)
 	       "RECLAIM  %12s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n"
 	       "THRASHING%12s%15s%15s\n"
+	       "      %15llu%15llu%15llums\n"
+	       "COMPACT  %12s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n",
 	       "count", "real total", "virtual total",
 	       "delay total", "delay average",
@@ -228,7 +230,11 @@ static void print_delayacct(struct taskstats *t)
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->thrashing_count,
 	       (unsigned long long)t->thrashing_delay_total,
-	       average_ms(t->thrashing_delay_total, t->thrashing_count));
+	       average_ms(t->thrashing_delay_total, t->thrashing_count),
+	       "count", "delay total", "delay average",
+	       (unsigned long long)t->compact_count,
+	       (unsigned long long)t->compact_delay_total,
+	       average_ms(t->compact_delay_total, t->compact_count));
 }
 
 static void task_context_switch_counts(struct taskstats *t)
-- 
cgit v1.2.3


From 6e10e21915c1ab6eaa145f7b5ebaf4500af1b011 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Sep 2021 16:09:08 -0300
Subject: tools headers UAPI: Sync files changed by new set_mempolicy_home_node
 syscall

To pick the changes in these csets:

  21b084fdf2a49ca1 ("mm/mempolicy: wire up syscall set_mempolicy_home_node")

That add support for this new syscall in tools such as 'perf trace'.

For instance, this is now possible:

  [root@five ~]# perf trace -e set_mempolicy_home_node
  ^C[root@five ~]#
  [root@five ~]# perf trace -v -e set_mempolicy_home_node
  Using CPUID AuthenticAMD-25-21-0
  event qualifier tracepoint filter: (common_pid != 253729 && common_pid != 3585) && (id == 450)
  mmap size 528384B
  ^C[root@five ~]
  [root@five ~]# perf trace -v -e set*  --max-events 5
  Using CPUID AuthenticAMD-25-21-0
  event qualifier tracepoint filter: (common_pid != 253734 && common_pid != 3585) && (id == 38 || id == 54 || id == 105 || id == 106 || id == 109 || id == 112 || id == 113 || id == 114 || id == 116 || id == 117 || id == 119 || id == 122 || id == 123 || id == 141 || id == 160 || id == 164 || id == 170 || id == 171 || id == 188 || id == 205 || id == 218 || id == 238 || id == 273 || id == 308 || id == 450)
  mmap size 528384B
       0.000 ( 0.008 ms): bash/253735 setpgid(pid: 253735 (bash), pgid: 253735 (bash))      = 0
    6849.011 ( 0.008 ms): bash/16046 setpgid(pid: 253736 (bash), pgid: 253736 (bash))       = 0
    6849.080 ( 0.005 ms): bash/253736 setpgid(pid: 253736 (bash), pgid: 253736 (bash))      = 0
    7437.718 ( 0.009 ms): gnome-shell/253737 set_robust_list(head: 0x7f34b527e920, len: 24) = 0
   13445.986 ( 0.010 ms): bash/16046 setpgid(pid: 253738 (bash), pgid: 253738 (bash))       = 0
  [root@five ~]#

That is the filter expression attached to the raw_syscalls:sys_{enter,exit}
tracepoints.

  $ find tools/perf/arch/ -name "syscall*tbl" | xargs grep -w set_mempolicy_home_node
  tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl:450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
  tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
  tools/perf/arch/s390/entry/syscalls/syscall.tbl:450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
  tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
  $

  $ grep -w set_mempolicy_home_node /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c
	[450] = "set_mempolicy_home_node",
  $

This addresses these perf build warnings:

  Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h'
  diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h
  Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'
  diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl
  Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl'
  diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl
  Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl'
  diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl
  Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl'
  diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl

Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/asm-generic/unistd.h             | 5 ++++-
 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 +
 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl  | 1 +
 tools/perf/arch/s390/entry/syscalls/syscall.tbl     | 1 +
 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl   | 1 +
 5 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 4557a8b6086f..1c48b0ae3ba3 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 #define __NR_futex_waitv 449
 __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 
+#define __NR_set_mempolicy_home_node 450
+__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+
 #undef __NR_syscalls
-#define __NR_syscalls 450
+#define __NR_syscalls 451
 
 /*
  * 32 bit systems traditionally used different
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
index e2c481fcede6..3f1886ad9d80 100644
--- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -364,3 +364,4 @@
 # 447 reserved for memfd_secret
 448	n64	process_mrelease		sys_process_mrelease
 449	n64	futex_waitv			sys_futex_waitv
+450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 15109af9d075..2600b4237292 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -529,3 +529,4 @@
 # 447 reserved for memfd_secret
 448	common	process_mrelease		sys_process_mrelease
 449	common  futex_waitv                     sys_futex_waitv
+450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index ed9c5c2eafad..799147658dee 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -452,3 +452,4 @@
 # 447 reserved for memfd_secret
 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease
 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
+450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index fe8f8dd157b4..c84d12608cd2 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -371,6 +371,7 @@
 447	common	memfd_secret		sys_memfd_secret
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
+450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
-- 
cgit v1.2.3


From a0f4ba7f51ea736a6b4ccf58563507d7af9128fb Mon Sep 17 00:00:00 2001
From: Jinrong Liang <cloudliang@tencent.com>
Date: Wed, 19 Jan 2022 21:39:10 +0800
Subject: selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c

The following warning appears when executing
make -C tools/testing/selftests/kvm

x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_intel_br_retired':
x86_64/pmu_event_filter_test.c:241:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable]
  241 |         struct kvm_cpuid2 *cpuid;
      |                            ^~~~~
x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_amd_zen_br_retired':
x86_64/pmu_event_filter_test.c:258:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable]
  258 |         struct kvm_cpuid2 *cpuid;
      |                            ^~~~~

Just delete the unused variables to stay away from warnings.

Fixes: dc7e75b3b3ee ("selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER")
Signed-off-by: Jinrong Liang <cloudliang@tencent.com>
Message-Id: <20220119133910.56285-1-cloudliang@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index aa104946e6e0..c715adcbd487 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -347,9 +347,7 @@ static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
 static bool use_intel_pmu(void)
 {
 	struct kvm_cpuid_entry2 *entry;
-	struct kvm_cpuid2 *cpuid;
 
-	cpuid = kvm_get_supported_cpuid();
 	entry = kvm_get_supported_cpuid_index(0xa, 0);
 	return is_intel_cpu() && entry && check_intel_pmu_leaf(entry);
 }
@@ -381,9 +379,7 @@ static bool is_zen3(uint32_t eax)
 static bool use_amd_pmu(void)
 {
 	struct kvm_cpuid_entry2 *entry;
-	struct kvm_cpuid2 *cpuid;
 
-	cpuid = kvm_get_supported_cpuid();
 	entry = kvm_get_supported_cpuid_index(1, 0);
 	return is_amd_cpu() && entry &&
 		(is_zen1(entry->eax) ||
-- 
cgit v1.2.3


From 83a34ad848937462aa64fa3d48f8c0b4034f2503 Mon Sep 17 00:00:00 2001
From: Jinrong Liang <cloudliang@tencent.com>
Date: Wed, 19 Jan 2022 22:03:25 +0800
Subject: selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c

The following warning appears when executing
make -C tools/testing/selftests/kvm

include/x86_64/processor.h:290:2: warning: 'ecx' may be used uninitialized in this
function [-Wmaybe-uninitialized]
  asm volatile("cpuid"
  ^~~
lib/x86_64/processor.c:1523:21: note: 'ecx' was declared here
  uint32_t eax, ebx, ecx, edx, max_ext_leaf;

Just initialize ecx to remove this warning.

Fixes: c8cc43c1eae2 ("selftests: KVM: avoid failures due to reserved HyperTransport region")
Signed-off-by: Jinrong Liang <cloudliang@tencent.com>
Message-Id: <20220119140325.59369-1-cloudliang@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 5c8c270a9158..5f9d7e91dc69 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1535,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 	/* Before family 17h, the HyperTransport area is just below 1T.  */
 	ht_gfn = (1 << 28) - num_ht_pages;
 	eax = 1;
+	ecx = 0;
 	cpuid(&eax, &ebx, &ecx, &edx);
 	if (x86_family(eax) < 0x17)
 		goto done;
-- 
cgit v1.2.3


From 9846921dba4936d92f7608315b5d1e0a8ec3a538 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 20 Jan 2022 16:35:29 -0800
Subject: selftests: mptcp: fix ipv6 routing setup

MPJ ipv6 selftests currently lack per link route to the server
net. Additionally, ipv6 subflows endpoints are created without any
interface specified. The end-result is that in ipv6 self-tests
subflows are created all on the same link, leading to expected delays
and sporadic self-tests failures.

Fix the issue by adding the missing setup bits.

Fixes: 523514ed0a99 ("selftests: mptcp: add ADD_ADDR IPv6 test cases")
Reported-and-tested-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 27d0eb9afdca..b8bdbec0cf69 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -75,6 +75,7 @@ init()
 
 		# let $ns2 reach any $ns1 address from any interface
 		ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
+		ip -net "$ns2" route add default via dead:beef:$i::1 dev ns2eth$i metric 10$i
 	done
 }
 
@@ -1476,7 +1477,7 @@ ipv6_tests()
 	reset
 	ip netns exec $ns1 ./pm_nl_ctl limits 0 1
 	ip netns exec $ns2 ./pm_nl_ctl limits 0 1
-	ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow
 	run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow
 	chk_join_nr "single subflow IPv6" 1 1 1
 
@@ -1511,7 +1512,7 @@ ipv6_tests()
 	ip netns exec $ns1 ./pm_nl_ctl limits 0 2
 	ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal
 	ip netns exec $ns2 ./pm_nl_ctl limits 1 2
-	ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow
+	ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow
 	run_tests $ns1 $ns2 dead:beef:1::1 0 -1 -1 slow
 	chk_join_nr "remove subflow and signal IPv6" 2 2 2
 	chk_add_nr 1 1
-- 
cgit v1.2.3


From afa114d987c40e72ebbbc36bedf7d66b7cdc5883 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Fri, 21 Jan 2022 18:34:49 +0100
Subject: selftests: net: ioam: expect support for Queue depth data

The IOAM queue-depth data field was added a few weeks ago,
but the test unit was not updated accordingly.

Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: b63c5478e9cb ("ipv6: ioam: Support for Queue depth data field")
Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Link: https://lore.kernel.org/r/20220121173449.26918-1-justin.iurman@uliege.be
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ioam6_parser.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/ioam6_parser.c b/tools/testing/selftests/net/ioam6_parser.c
index 8f6997d35816..d9d1d4190126 100644
--- a/tools/testing/selftests/net/ioam6_parser.c
+++ b/tools/testing/selftests/net/ioam6_parser.c
@@ -240,11 +240,8 @@ static int check_ioam6_data(__u8 **p, struct ioam6_trace_hdr *ioam6h,
 		*p += sizeof(__u32);
 	}
 
-	if (ioam6h->type.bit6) {
-		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-			return 1;
+	if (ioam6h->type.bit6)
 		*p += sizeof(__u32);
-	}
 
 	if (ioam6h->type.bit7) {
 		if (__be32_to_cpu(*((__u32 *)*p)) != 0xffffffff)
-- 
cgit v1.2.3


From 9edcde68d653e1f8f895fbb69a0043c6a56ae35e Mon Sep 17 00:00:00 2001
From: Yao Jin <jinyao5@huawei.com>
Date: Fri, 21 Jan 2022 14:59:54 +0800
Subject: perf script: Fix printing 'phys_addr' failure issue

Perf script was failed to print the phys_addr for SPE profiling.
One 'dummy' event is added by SPE profiling but it doesn't have PHYS_ADDR
attribute set, perf script then exits with error.

Now referring to 'addr', use evsel__do_check_stype() to check the type.

Before:

  # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\
		store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3
  # perf script -F pid,tid,addr,phys_addr
  Samples for 'dummy:u' event do not have PHYS_ADDR attribute set. Cannot print 'phys_addr' field.

After:

  # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\
		store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3
  # perf script -F pid,tid,addr,phys_addr
  4064384/4064384 ffff802f921be0d0      2f921be0d0
  4064384/4064384 ffff802f921be0d0      2f921be0d0

Reviewed-by: German Gomez <german.gomez@arm.com>
Signed-off-by: Yao Jin <jinyao5@huawei.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20220121065954.2121900-1-liwei391@huawei.com
Signed-off-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ecd4f99a6c14..abae8184e171 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -515,7 +515,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session)
 		return -EINVAL;
 
 	if (PRINT_FIELD(PHYS_ADDR) &&
-	    evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR))
+	    evsel__do_check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR, allow_user_set))
 		return -EINVAL;
 
 	if (PRINT_FIELD(DATA_PAGE_SIZE) &&
-- 
cgit v1.2.3


From 1d1d9af254ffc3bc38c59484c50c600d1d0c96da Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 21 Jan 2022 20:58:09 -0800
Subject: perf python: Fix cpu_map__item() building
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Value should be built as an integer.

Switch some uses of perf_cpu_map to use the library API.

Fixes: 6d18804b963b78dc ("perf cpumap: Give CPUs their own type")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: André Almeida <andrealmeid@collabora.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Miaoqian Lin <linmq006@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yury Norov <yury.norov@gmail.com>
Link: http://lore.kernel.org/lkml/20220122045811.3402706-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/python.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index f3e5131f183c..52d8995cfd73 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -638,17 +638,17 @@ static Py_ssize_t pyrf_cpu_map__length(PyObject *obj)
 {
 	struct pyrf_cpu_map *pcpus = (void *)obj;
 
-	return pcpus->cpus->nr;
+	return perf_cpu_map__nr(pcpus->cpus);
 }
 
 static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i)
 {
 	struct pyrf_cpu_map *pcpus = (void *)obj;
 
-	if (i >= pcpus->cpus->nr)
+	if (i >= perf_cpu_map__nr(pcpus->cpus))
 		return NULL;
 
-	return Py_BuildValue("i", pcpus->cpus->map[i]);
+	return Py_BuildValue("i", perf_cpu_map__cpu(pcpus->cpus, i).cpu);
 }
 
 static PySequenceMethods pyrf_cpu_map__sequence_methods = {
-- 
cgit v1.2.3


From 440286993960bea4aa09d912a5497d92d09ae54c Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 21 Jan 2022 20:58:10 -0800
Subject: perf cpumap: Migrate to libperf cpumap api
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch from directly accessing the perf_cpu_map to using the appropriate
libperf API when possible. Using the API simplifies the job of
refactoring use of perf_cpu_map.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey Bayduraev <alexey.v.bayduraev@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: André Almeida <andrealmeid@collabora.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Miaoqian Lin <linmq006@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Riccardo Mancini <rickyman7@gmail.com>
Cc: Shunsuke Nakamura <nakamura.shun@fujitsu.com>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yury Norov <yury.norov@gmail.com>
Link: http://lore.kernel.org/lkml/20220122045811.3402706-3-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evsel.c                             |  4 +--
 tools/perf/bench/epoll-ctl.c                       |  2 +-
 tools/perf/bench/epoll-wait.c                      |  2 +-
 tools/perf/bench/evlist-open-close.c               |  4 +--
 tools/perf/bench/futex-hash.c                      |  2 +-
 tools/perf/bench/futex-lock-pi.c                   |  2 +-
 tools/perf/bench/futex-requeue.c                   |  2 +-
 tools/perf/bench/futex-wake-parallel.c             |  2 +-
 tools/perf/bench/futex-wake.c                      |  2 +-
 tools/perf/builtin-ftrace.c                        |  2 +-
 tools/perf/builtin-stat.c                          |  7 ++--
 tools/perf/tests/bitmap.c                          |  4 +--
 tools/perf/tests/event_update.c                    |  8 ++---
 tools/perf/tests/mem2node.c                        |  9 +++---
 tools/perf/tests/mmap-basic.c                      |  5 +--
 tools/perf/tests/topology.c                        | 37 ++++++++++++----------
 tools/perf/util/auxtrace.c                         |  2 +-
 tools/perf/util/counts.c                           |  2 +-
 tools/perf/util/cpumap.h                           |  2 +-
 tools/perf/util/cputopo.c                          |  4 +--
 tools/perf/util/evlist-hybrid.c                    | 11 ++++---
 tools/perf/util/evsel.c                            | 20 ++++++------
 tools/perf/util/evsel.h                            |  3 +-
 tools/perf/util/mmap.c                             |  2 +-
 tools/perf/util/perf_api_probe.c                   |  4 +--
 tools/perf/util/record.c                           |  6 ++--
 .../util/scripting-engines/trace-event-python.c    |  4 +--
 tools/perf/util/session.c                          |  4 +--
 tools/perf/util/svghelper.c                        |  4 +--
 tools/perf/util/synthetic-events.c                 | 18 +++++------
 tools/perf/util/top.c                              |  6 ++--
 31 files changed, 99 insertions(+), 87 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 7ea86a44eae5..210ea7c06ce8 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -141,7 +141,7 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 	}
 
 	if (evsel->fd == NULL &&
-	    perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
+	    perf_evsel__alloc_fd(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0)
 		return -ENOMEM;
 
 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
@@ -384,7 +384,7 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter)
 {
 	int err = 0, i;
 
-	for (i = 0; i < evsel->cpus->nr && !err; i++)
+	for (i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++)
 		err = perf_evsel__run_ioctl(evsel,
 				     PERF_EVENT_IOC_SET_FILTER,
 				     (void *)filter, i);
diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
index 1a17ec83d3c4..740ae764537e 100644
--- a/tools/perf/bench/epoll-ctl.c
+++ b/tools/perf/bench/epoll-ctl.c
@@ -333,7 +333,7 @@ int bench_epoll_ctl(int argc, const char **argv)
 
 	/* default to the number of CPUs */
 	if (!nthreads)
-		nthreads = cpu->nr;
+		nthreads = perf_cpu_map__nr(cpu);
 
 	worker = calloc(nthreads, sizeof(*worker));
 	if (!worker)
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 0d1dd8879197..37de970c9743 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -452,7 +452,7 @@ int bench_epoll_wait(int argc, const char **argv)
 
 	/* default to the number of CPUs and leave one for the writer pthread */
 	if (!nthreads)
-		nthreads = cpu->nr - 1;
+		nthreads = perf_cpu_map__nr(cpu) - 1;
 
 	worker = calloc(nthreads, sizeof(*worker));
 	if (!worker) {
diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c
index 482738e9bdad..de56601f69ee 100644
--- a/tools/perf/bench/evlist-open-close.c
+++ b/tools/perf/bench/evlist-open-close.c
@@ -71,7 +71,7 @@ static int evlist__count_evsel_fds(struct evlist *evlist)
 	int cnt = 0;
 
 	evlist__for_each_entry(evlist, evsel)
-		cnt += evsel->core.threads->nr * evsel->core.cpus->nr;
+		cnt += evsel->core.threads->nr * perf_cpu_map__nr(evsel->core.cpus);
 
 	return cnt;
 }
@@ -151,7 +151,7 @@ static int bench_evlist_open_close__run(char *evstr)
 
 	init_stats(&time_stats);
 
-	printf("  Number of cpus:\t%d\n", evlist->core.cpus->nr);
+	printf("  Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.cpus));
 	printf("  Number of threads:\t%d\n", evlist->core.threads->nr);
 	printf("  Number of events:\t%d (%d fds)\n",
 		evlist->core.nr_entries, evlist__count_evsel_fds(evlist));
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 9627b6ab8670..dbcecec4eeda 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -150,7 +150,7 @@ int bench_futex_hash(int argc, const char **argv)
 	}
 
 	if (!params.nthreads) /* default to the number of CPUs */
-		params.nthreads = cpu->nr;
+		params.nthreads = perf_cpu_map__nr(cpu);
 
 	worker = calloc(params.nthreads, sizeof(*worker));
 	if (!worker)
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index a512a320df74..6fc9a3d55c1f 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -173,7 +173,7 @@ int bench_futex_lock_pi(int argc, const char **argv)
 	}
 
 	if (!params.nthreads)
-		params.nthreads = cpu->nr;
+		params.nthreads = perf_cpu_map__nr(cpu);
 
 	worker = calloc(params.nthreads, sizeof(*worker));
 	if (!worker)
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index aca47ce8b1e7..2f59d5d1c509 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -175,7 +175,7 @@ int bench_futex_requeue(int argc, const char **argv)
 	}
 
 	if (!params.nthreads)
-		params.nthreads = cpu->nr;
+		params.nthreads = perf_cpu_map__nr(cpu);
 
 	worker = calloc(params.nthreads, sizeof(*worker));
 	if (!worker)
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index 888ee6037945..861deb934745 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -252,7 +252,7 @@ int bench_futex_wake_parallel(int argc, const char **argv)
 		err(EXIT_FAILURE, "calloc");
 
 	if (!params.nthreads)
-		params.nthreads = cpu->nr;
+		params.nthreads = perf_cpu_map__nr(cpu);
 
 	/* some sanity checks */
 	if (params.nwakes > params.nthreads ||
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index aa82db51c0ab..cfda48bef1d7 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -151,7 +151,7 @@ int bench_futex_wake(int argc, const char **argv)
 	}
 
 	if (!params.nthreads)
-		params.nthreads = cpu->nr;
+		params.nthreads = perf_cpu_map__nr(cpu);
 
 	worker = calloc(params.nthreads, sizeof(*worker));
 	if (!worker)
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 71452599f87d..dec24dc0e767 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -281,7 +281,7 @@ static int set_tracing_cpumask(struct perf_cpu_map *cpumap)
 	int ret;
 	int last_cpu;
 
-	last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1).cpu;
+	last_cpu = perf_cpu_map__cpu(cpumap, perf_cpu_map__nr(cpumap) - 1).cpu;
 	mask_size = last_cpu / 4 + 2; /* one more byte for EOS */
 	mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 934e992c966f..3f98689dd687 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -230,11 +230,12 @@ static bool cpus_map_matched(struct evsel *a, struct evsel *b)
 	if (!a->core.cpus || !b->core.cpus)
 		return false;
 
-	if (a->core.cpus->nr != b->core.cpus->nr)
+	if (perf_cpu_map__nr(a->core.cpus) != perf_cpu_map__nr(b->core.cpus))
 		return false;
 
-	for (int i = 0; i < a->core.cpus->nr; i++) {
-		if (a->core.cpus->map[i].cpu != b->core.cpus->map[i].cpu)
+	for (int i = 0; i < perf_cpu_map__nr(a->core.cpus); i++) {
+		if (perf_cpu_map__cpu(a->core.cpus, i).cpu !=
+		    perf_cpu_map__cpu(b->core.cpus, i).cpu)
 			return false;
 	}
 
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 0bf399c49849..4965dd666956 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -17,8 +17,8 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 	bm = bitmap_zalloc(nbits);
 
 	if (map && bm) {
-		for (i = 0; i < map->nr; i++)
-			set_bit(map->map[i].cpu, bm);
+		for (i = 0; i < perf_cpu_map__nr(map); i++)
+			set_bit(perf_cpu_map__cpu(map, i).cpu, bm);
 	}
 
 	if (map)
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index 16b6d6f47f38..78db4d704e76 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -75,10 +75,10 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
 
 	TEST_ASSERT_VAL("wrong id", ev->id == 123);
 	TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS);
-	TEST_ASSERT_VAL("wrong cpus", map->nr == 3);
-	TEST_ASSERT_VAL("wrong cpus", map->map[0].cpu == 1);
-	TEST_ASSERT_VAL("wrong cpus", map->map[1].cpu == 2);
-	TEST_ASSERT_VAL("wrong cpus", map->map[2].cpu == 3);
+	TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__nr(map) == 3);
+	TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 0).cpu == 1);
+	TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 1).cpu == 2);
+	TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 2).cpu == 3);
 	perf_cpu_map__put(map);
 	return 0;
 }
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index f4a4aba33f76..4c96829510c9 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -25,14 +25,15 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 {
 	struct perf_cpu_map *map = perf_cpu_map__new(str);
 	unsigned long *bm = NULL;
-	int i;
 
 	bm = bitmap_zalloc(nbits);
 
 	if (map && bm) {
-		for (i = 0; i < map->nr; i++) {
-			set_bit(map->map[i].cpu, bm);
-		}
+		struct perf_cpu cpu;
+		int i;
+
+		perf_cpu_map__for_each_cpu(cpu, i, map)
+			set_bit(cpu.cpu, bm);
 	}
 
 	if (map)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 0ad62914b4d7..c3c17600f29c 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -59,11 +59,12 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest
 	}
 
 	CPU_ZERO(&cpu_set);
-	CPU_SET(cpus->map[0].cpu, &cpu_set);
+	CPU_SET(perf_cpu_map__cpu(cpus, 0).cpu, &cpu_set);
 	sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
 	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 		pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-			 cpus->map[0].cpu, str_error_r(errno, sbuf, sizeof(sbuf)));
+			 perf_cpu_map__cpu(cpus, 0).cpu,
+			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_free_cpus;
 	}
 
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index c4ef0c7002f1..ee1e3dcbc0bd 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -122,44 +122,48 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	}
 
 	// Test that CPU ID contains socket, die, core and CPU
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL);
-		TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i].cpu == id.cpu.cpu);
+		TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match",
+				perf_cpu_map__cpu(map, i).cpu == id.cpu.cpu);
 
 		TEST_ASSERT_VAL("Cpu map - Core ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].core_id == id.core);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core);
 		TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id ==
+			id.socket);
 
 		TEST_ASSERT_VAL("Cpu map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die);
 		TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1);
 	}
 
 	// Test that core ID contains socket, die and core
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Core map - Core ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].core_id == id.core);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core);
 
 		TEST_ASSERT_VAL("Core map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id ==
+			id.socket);
 
 		TEST_ASSERT_VAL("Core map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die);
 		TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1);
 	}
 
 	// Test that die ID contains socket and die
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Die map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id ==
+			id.socket);
 
 		TEST_ASSERT_VAL("Die map - Die ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].die_id == id.die);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die);
 
 		TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Die map - Core is set", id.core == -1);
@@ -168,10 +172,11 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	}
 
 	// Test that socket ID contains only socket
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Socket map - Socket ID doesn't match",
-			session->header.env.cpu[map->map[i].cpu].socket_id == id.socket);
+			session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id ==
+			id.socket);
 
 		TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1);
 		TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1);
@@ -181,10 +186,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	}
 
 	// Test that node ID contains only node
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL);
 		TEST_ASSERT_VAL("Node map - Node ID doesn't match",
-			cpu__get_node(map->map[i]) == id.node);
+				cpu__get_node(perf_cpu_map__cpu(map, i)) == id.node);
 		TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1);
 		TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1);
 		TEST_ASSERT_VAL("Node map - Core is set", id.core == -1);
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 5632efc44738..825336304a37 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
 	mp->idx = idx;
 
 	if (per_cpu) {
-		mp->cpu = evlist->core.cpus->map[idx];
+		mp->cpu = perf_cpu_map__cpu(evlist->core.cpus, idx);
 		if (evlist->core.threads)
 			mp->tid = perf_thread_map__pid(evlist->core.threads, 0);
 		else
diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c
index 2b81707b9dba..7a447d918458 100644
--- a/tools/perf/util/counts.c
+++ b/tools/perf/util/counts.c
@@ -61,7 +61,7 @@ int evsel__alloc_counts(struct evsel *evsel)
 	struct perf_cpu_map *cpus = evsel__cpus(evsel);
 	int nthreads = perf_thread_map__nr(evsel->core.threads);
 
-	evsel->counts = perf_counts__new(cpus ? cpus->nr : 1, nthreads);
+	evsel->counts = perf_counts__new(perf_cpu_map__nr(cpus), nthreads);
 	return evsel->counts != NULL ? 0 : -ENOMEM;
 }
 
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 0d3c2006a15d..5c85fbd709b4 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -57,7 +57,7 @@ struct perf_cpu cpu__max_present_cpu(void);
  */
 static inline bool cpu_map__is_dummy(struct perf_cpu_map *cpus)
 {
-	return cpus->nr == 1 && cpus->map[0].cpu == -1;
+	return perf_cpu_map__nr(cpus) == 1 && perf_cpu_map__cpu(cpus, 0).cpu == -1;
 }
 
 /**
diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c
index e20b835a1194..d275d843c155 100644
--- a/tools/perf/util/cputopo.c
+++ b/tools/perf/util/cputopo.c
@@ -325,7 +325,7 @@ struct numa_topology *numa_topology__new(void)
 	if (!node_map)
 		goto out;
 
-	nr = (u32) node_map->nr;
+	nr = (u32) perf_cpu_map__nr(node_map);
 
 	tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0])*nr);
 	if (!tp)
@@ -334,7 +334,7 @@ struct numa_topology *numa_topology__new(void)
 	tp->nr = nr;
 
 	for (i = 0; i < nr; i++) {
-		if (load_numa_node(&tp->nodes[i], node_map->map[i].cpu)) {
+		if (load_numa_node(&tp->nodes[i], perf_cpu_map__cpu(node_map, i).cpu)) {
 			numa_topology__delete(tp);
 			tp = NULL;
 			break;
diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c
index 7c554234b43d..7f234215147d 100644
--- a/tools/perf/util/evlist-hybrid.c
+++ b/tools/perf/util/evlist-hybrid.c
@@ -124,22 +124,23 @@ int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list)
 
 		events_nr++;
 
-		if (matched_cpus->nr > 0 && (unmatched_cpus->nr > 0 ||
-		    matched_cpus->nr < cpus->nr ||
-		    matched_cpus->nr < pmu->cpus->nr)) {
+		if (perf_cpu_map__nr(matched_cpus) > 0 &&
+		    (perf_cpu_map__nr(unmatched_cpus) > 0 ||
+		     perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(cpus) ||
+		     perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(pmu->cpus))) {
 			perf_cpu_map__put(evsel->core.cpus);
 			perf_cpu_map__put(evsel->core.own_cpus);
 			evsel->core.cpus = perf_cpu_map__get(matched_cpus);
 			evsel->core.own_cpus = perf_cpu_map__get(matched_cpus);
 
-			if (unmatched_cpus->nr > 0) {
+			if (perf_cpu_map__nr(unmatched_cpus) > 0) {
 				cpu_map__snprint(matched_cpus, buf1, sizeof(buf1));
 				pr_warning("WARNING: use %s in '%s' for '%s', skip other cpus in list.\n",
 					   buf1, pmu->name, evsel->name);
 			}
 		}
 
-		if (matched_cpus->nr == 0) {
+		if (perf_cpu_map__nr(matched_cpus) == 0) {
 			evlist__remove(evlist, evsel);
 			evsel__delete(evsel);
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2f6b18af49e5..fb0a2debf015 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1782,7 +1782,7 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
 		nthreads = threads->nr;
 
 	if (evsel->core.fd == NULL &&
-	    perf_evsel__alloc_fd(&evsel->core, cpus->nr, nthreads) < 0)
+	    perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
 		return -ENOMEM;
 
 	evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
@@ -2020,9 +2020,10 @@ retry_open:
 			test_attr__ready();
 
 			pr_debug2_peo("sys_perf_event_open: pid %d  cpu %d  group_fd %d  flags %#lx",
-				pid, cpus->map[idx].cpu, group_fd, evsel->open_flags);
+				pid, perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags);
 
-			fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx].cpu,
+			fd = sys_perf_event_open(&evsel->core.attr, pid,
+						perf_cpu_map__cpu(cpus, idx).cpu,
 						group_fd, evsel->open_flags);
 
 			FD(evsel, idx, thread) = fd;
@@ -2038,7 +2039,8 @@ retry_open:
 			bpf_counter__install_pe(evsel, idx, fd);
 
 			if (unlikely(test_attr__enabled)) {
-				test_attr__open(&evsel->core.attr, pid, cpus->map[idx],
+				test_attr__open(&evsel->core.attr, pid,
+						perf_cpu_map__cpu(cpus, idx),
 						fd, group_fd, evsel->open_flags);
 			}
 
@@ -2079,7 +2081,8 @@ try_fallback:
 	if (evsel__precise_ip_fallback(evsel))
 		goto retry_open;
 
-	if (evsel__ignore_missing_thread(evsel, cpus->nr, idx, threads, thread, err)) {
+	if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus),
+					 idx, threads, thread, err)) {
 		/* We just removed 1 thread, so lower the upper nthreads limit. */
 		nthreads--;
 
@@ -2119,7 +2122,7 @@ out_close:
 int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus,
 		struct perf_thread_map *threads)
 {
-	return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1);
+	return evsel__open_cpu(evsel, cpus, threads, 0, perf_cpu_map__nr(cpus));
 }
 
 void evsel__close(struct evsel *evsel)
@@ -2131,8 +2134,7 @@ void evsel__close(struct evsel *evsel)
 int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu_map_idx)
 {
 	if (cpu_map_idx == -1)
-		return evsel__open_cpu(evsel, cpus, NULL, 0,
-					cpus ? cpus->nr : 1);
+		return evsel__open_cpu(evsel, cpus, NULL, 0, perf_cpu_map__nr(cpus));
 
 	return evsel__open_cpu(evsel, cpus, NULL, cpu_map_idx, cpu_map_idx + 1);
 }
@@ -2982,7 +2984,7 @@ int evsel__store_ids(struct evsel *evsel, struct evlist *evlist)
 	struct perf_cpu_map *cpus = evsel->core.cpus;
 	struct perf_thread_map *threads = evsel->core.threads;
 
-	if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr))
+	if (perf_evsel__alloc_id(&evsel->core, perf_cpu_map__nr(cpus), threads->nr))
 		return -ENOMEM;
 
 	return store_evsel_ids(evsel, evlist);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 5720ceebffac..041b42d33bf5 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -11,6 +11,7 @@
 #include <perf/evsel.h>
 #include "symbol_conf.h"
 #include <internal/cpumap.h>
+#include <perf/cpumap.h>
 
 struct bpf_object;
 struct cgroup;
@@ -191,7 +192,7 @@ static inline struct perf_cpu_map *evsel__cpus(struct evsel *evsel)
 
 static inline int evsel__nr_cpus(struct evsel *evsel)
 {
-	return evsel__cpus(evsel)->nr;
+	return perf_cpu_map__nr(evsel__cpus(evsel));
 }
 
 void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread,
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 12261ed8c15b..0e8ff8d1e206 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -250,7 +250,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask)
 
 	nr_cpus = perf_cpu_map__nr(cpu_map);
 	for (idx = 0; idx < nr_cpus; idx++) {
-		cpu = cpu_map->map[idx]; /* map c index to online cpu index */
+		cpu = perf_cpu_map__cpu(cpu_map, idx); /* map c index to online cpu index */
 		if (cpu__get_node(cpu) == node)
 			set_bit(cpu.cpu, mask->bits);
 	}
diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c
index 734d006d9a8c..c28dd50bd571 100644
--- a/tools/perf/util/perf_api_probe.c
+++ b/tools/perf/util/perf_api_probe.c
@@ -67,7 +67,7 @@ static bool perf_probe_api(setup_probe_fn_t fn)
 	cpus = perf_cpu_map__new(NULL);
 	if (!cpus)
 		return false;
-	cpu = cpus->map[0];
+	cpu = perf_cpu_map__cpu(cpus, 0);
 	perf_cpu_map__put(cpus);
 
 	do {
@@ -144,7 +144,7 @@ bool perf_can_record_cpu_wide(void)
 	if (!cpus)
 		return false;
 
-	cpu = cpus->map[0];
+	cpu = perf_cpu_map__cpu(cpus, 0);
 	perf_cpu_map__put(cpus);
 
 	fd = sys_perf_event_open(&attr, -1, cpu.cpu, -1, 0);
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 20461f174991..007a64681416 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -106,7 +106,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call
 	if (opts->group)
 		evlist__set_leader(evlist);
 
-	if (evlist->core.cpus->map[0].cpu < 0)
+	if (perf_cpu_map__cpu(evlist->core.cpus, 0).cpu < 0)
 		opts->no_inherit = true;
 
 	use_comm_exec = perf_can_comm_exec();
@@ -248,11 +248,11 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str)
 		struct perf_cpu_map *cpus = perf_cpu_map__new(NULL);
 
 		if (cpus)
-			cpu =  cpus->map[0];
+			cpu =  perf_cpu_map__cpu(cpus, 0);
 
 		perf_cpu_map__put(cpus);
 	} else {
-		cpu = evlist->core.cpus->map[0];
+		cpu = perf_cpu_map__cpu(evlist->core.cpus, 0);
 	}
 
 	while (1) {
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index f5ad0e62227a..e752e1f4a5f0 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1607,8 +1607,8 @@ static void python_process_stat(struct perf_stat_config *config,
 	}
 
 	for (thread = 0; thread < threads->nr; thread++) {
-		for (cpu = 0; cpu < cpus->nr; cpu++) {
-			process_stat(counter, cpus->map[cpu],
+		for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) {
+			process_stat(counter, perf_cpu_map__cpu(cpus, cpu),
 				     perf_thread_map__pid(threads, thread), tstamp,
 				     perf_counts(counter->counts, cpu, thread));
 		}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index f19348dddd55..2c0d30f08e78 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2537,8 +2537,8 @@ int perf_session__cpu_bitmap(struct perf_session *session,
 		return -1;
 	}
 
-	for (i = 0; i < map->nr; i++) {
-		struct perf_cpu cpu = map->map[i];
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
+		struct perf_cpu cpu = perf_cpu_map__cpu(map, i);
 
 		if (cpu.cpu >= nr_cpus) {
 			pr_err("Requested CPU %d too large. "
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index 4c9f211249db..1e0c731fc539 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -734,8 +734,8 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus)
 	if (!m)
 		return -1;
 
-	for (i = 0; i < m->nr; i++) {
-		c = m->map[i];
+	for (i = 0; i < perf_cpu_map__nr(m); i++) {
+		c = perf_cpu_map__cpu(m, i);
 		if (c.cpu >= nr_cpus) {
 			ret = -1;
 			break;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index c9ba8050cc2b..70f095624a0b 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1186,12 +1186,12 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool,
 static void synthesize_cpus(struct cpu_map_entries *cpus,
 			    struct perf_cpu_map *map)
 {
-	int i;
+	int i, map_nr = perf_cpu_map__nr(map);
 
-	cpus->nr = map->nr;
+	cpus->nr = map_nr;
 
-	for (i = 0; i < map->nr; i++)
-		cpus->cpu[i] = map->map[i].cpu;
+	for (i = 0; i < map_nr; i++)
+		cpus->cpu[i] = perf_cpu_map__cpu(map, i).cpu;
 }
 
 static void synthesize_mask(struct perf_record_record_cpu_map *mask,
@@ -1202,13 +1202,13 @@ static void synthesize_mask(struct perf_record_record_cpu_map *mask,
 	mask->nr = BITS_TO_LONGS(max);
 	mask->long_size = sizeof(long);
 
-	for (i = 0; i < map->nr; i++)
-		set_bit(map->map[i].cpu, mask->mask);
+	for (i = 0; i < perf_cpu_map__nr(map); i++)
+		set_bit(perf_cpu_map__cpu(map, i).cpu, mask->mask);
 }
 
 static size_t cpus_size(struct perf_cpu_map *map)
 {
-	return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16);
+	return sizeof(struct cpu_map_entries) + perf_cpu_map__nr(map) * sizeof(u16);
 }
 
 static size_t mask_size(struct perf_cpu_map *map, int *max)
@@ -1217,9 +1217,9 @@ static size_t mask_size(struct perf_cpu_map *map, int *max)
 
 	*max = 0;
 
-	for (i = 0; i < map->nr; i++) {
+	for (i = 0; i < perf_cpu_map__nr(map); i++) {
 		/* bit position of the cpu is + 1 */
-		int bit = map->map[i].cpu + 1;
+		int bit = perf_cpu_map__cpu(map, i).cpu + 1;
 
 		if (bit > *max)
 			*max = bit;
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 27945eeb0cb5..c1ebfc5d2e0c 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -95,15 +95,15 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 
 	if (target->cpu_list)
 		ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)",
-				top->evlist->core.cpus->nr > 1 ? "s" : "",
+				perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : "",
 				target->cpu_list);
 	else {
 		if (target->tid)
 			ret += SNPRINTF(bf + ret, size - ret, ")");
 		else
 			ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)",
-					top->evlist->core.cpus->nr,
-					top->evlist->core.cpus->nr > 1 ? "s" : "");
+					perf_cpu_map__nr(top->evlist->core.cpus),
+					perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : "");
 	}
 
 	perf_top__reset_sample_counters(top);
-- 
cgit v1.2.3


From 24ead7c254b42c4ea252e57bf9928154dc7744e0 Mon Sep 17 00:00:00 2001
From: Lv Ruyi <lv.ruyi@zte.com.cn>
Date: Mon, 17 Jan 2022 08:37:30 +0000
Subject: perf cpumap: Remove duplicate include in cpumap.h

Remove all but the first include of stdbool.h from cpumap.h.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Lv Ruyi <lv.ruyi@zte.com.cn>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20220117083730.863200-1-lv.ruyi@zte.com.cn
Signed-off-by: CGEL ZTE <cgel.zte@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 5c85fbd709b4..703ae6d3386e 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -4,7 +4,6 @@
 
 #include <stdbool.h>
 #include <stdio.h>
-#include <stdbool.h>
 #include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
-- 
cgit v1.2.3


From 3606c0e1a1050d397ad759a62607e419fd8b0ccb Mon Sep 17 00:00:00 2001
From: German Gomez <german.gomez@arm.com>
Date: Tue, 18 Jan 2022 14:40:54 +0000
Subject: perf evsel: Override attr->sample_period for non-libpfm4 events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A previous patch preventing "attr->sample_period" values from being
overridden in pfm events changed a related behaviour in arm-spe.

Before said patch:

  perf record -c 10000 -e arm_spe_0// -- sleep 1

Would yield an SPE event with period=10000. After the patch, the period
in "-c 10000" was being ignored because the arm-spe code initializes
sample_period to a non-zero value.

This patch restores the previous behaviour for non-libpfm4 events.

Fixes: ae5dcc8abe31 (“perf record: Prevent override of attr->sample_period for libpfm4 events”)
Reported-by: Chase Conklin <chase.conklin@arm.com>
Signed-off-by: German Gomez <german.gomez@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org
Link: http://lore.kernel.org/lkml/20220118144054.2541-1-german.gomez@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index fb0a2debf015..22d3267ce294 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1064,6 +1064,17 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un
 {
 }
 
+static void evsel__set_default_freq_period(struct record_opts *opts,
+					   struct perf_event_attr *attr)
+{
+	if (opts->freq) {
+		attr->freq = 1;
+		attr->sample_freq = opts->freq;
+	} else {
+		attr->sample_period = opts->default_interval;
+	}
+}
+
 /*
  * The enable_on_exec/disabled value strategy:
  *
@@ -1130,14 +1141,12 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
 	 * We default some events to have a default interval. But keep
 	 * it a weak assumption overridable by the user.
 	 */
-	if (!attr->sample_period) {
-		if (opts->freq) {
-			attr->freq		= 1;
-			attr->sample_freq	= opts->freq;
-		} else {
-			attr->sample_period = opts->default_interval;
-		}
-	}
+	if ((evsel->is_libpfm_event && !attr->sample_period) ||
+	    (!evsel->is_libpfm_event && (!attr->sample_period ||
+					 opts->user_freq != UINT_MAX ||
+					 opts->user_interval != ULLONG_MAX)))
+		evsel__set_default_freq_period(opts, attr);
+
 	/*
 	 * If attr->freq was set (here or earlier), ask for period
 	 * to be sampled.
-- 
cgit v1.2.3


From 864bc8c905261f264c3ea357027cf555fe51c5a3 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Mon, 17 Jan 2022 23:10:13 +0800
Subject: perf parse-events: Support event alias in form foo-bar-baz

Event aliasing for events whose name in the form foo-bar-baz is not
supported, while foo-bar, foo_bar_baz, and other combinations are, i.e.
two hyphens are not supported.

The HiSilicon D06 platform has events in such form:

  $ ./perf list sdir-home-migrate

  List of pre-defined events (to be used in -e):

  uncore hha:
    sdir-home-migrate
   [Unit: hisi_sccl,hha]

  $ sudo ./perf stat -e sdir-home-migrate
  event syntax error: 'sdir-home-migrate'
                          \___ parser error
  Run 'perf list' for a list of valid events

   Usage: perf stat [<options>] [<command>]

   -e, --event <event>event selector. use 'perf list' to list available events

To support, add an extra PMU event symbol type for "baz", and add a new
rule in the bison file.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Liu <liuqi115@huawei.com>
Cc: Shaokun Zhang <zhangshaokun@hisilicon.com>
Cc: linuxarm@huawei.com
Link: https://lore.kernel.org/r/1642432215-234089-2-git-send-email-john.garry@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events.c | 25 +++++++++++++++++++++++--
 tools/perf/util/parse-events.h |  1 +
 tools/perf/util/parse-events.l |  2 ++
 tools/perf/util/parse-events.y | 17 +++++++++++++++--
 4 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index acf20ce98ce9..879f606e07e6 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -2098,8 +2098,17 @@ static void perf_pmu__parse_init(void)
 	pmu = NULL;
 	while ((pmu = perf_pmu__scan(pmu)) != NULL) {
 		list_for_each_entry(alias, &pmu->aliases, list) {
-			if (strchr(alias->name, '-'))
+			char *tmp = strchr(alias->name, '-');
+
+			if (tmp) {
+				char *tmp2 = NULL;
+
+				tmp2 = strchr(tmp + 1, '-');
 				len++;
+				if (tmp2)
+					len++;
+			}
+
 			len++;
 		}
 	}
@@ -2119,8 +2128,20 @@ static void perf_pmu__parse_init(void)
 		list_for_each_entry(alias, &pmu->aliases, list) {
 			struct perf_pmu_event_symbol *p = perf_pmu_events_list + len;
 			char *tmp = strchr(alias->name, '-');
+			char *tmp2 = NULL;
 
-			if (tmp != NULL) {
+			if (tmp)
+				tmp2 = strchr(tmp + 1, '-');
+			if (tmp2) {
+				SET_SYMBOL(strndup(alias->name, tmp - alias->name),
+						PMU_EVENT_SYMBOL_PREFIX);
+				p++;
+				tmp++;
+				SET_SYMBOL(strndup(tmp, tmp2 - tmp), PMU_EVENT_SYMBOL_SUFFIX);
+				p++;
+				SET_SYMBOL(strdup(++tmp2), PMU_EVENT_SYMBOL_SUFFIX2);
+				len += 3;
+			} else if (tmp) {
 				SET_SYMBOL(strndup(alias->name, tmp - alias->name),
 						PMU_EVENT_SYMBOL_PREFIX);
 				p++;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index c7fc93f54577..a38b8b160e80 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -53,6 +53,7 @@ enum perf_pmu_event_symbol_type {
 	PMU_EVENT_SYMBOL,		/* normal style PMU event */
 	PMU_EVENT_SYMBOL_PREFIX,	/* prefix of pre-suf style event */
 	PMU_EVENT_SYMBOL_SUFFIX,	/* suffix of pre-suf style event */
+	PMU_EVENT_SYMBOL_SUFFIX2,	/* suffix of pre-suf2 style event */
 };
 
 struct perf_pmu_event_symbol {
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 4efe9872c667..5b6e4b5249cf 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -149,6 +149,8 @@ static int pmu_str_check(yyscan_t scanner, struct parse_events_state *parse_stat
 			return PE_PMU_EVENT_PRE;
 		case PMU_EVENT_SYMBOL_SUFFIX:
 			return PE_PMU_EVENT_SUF;
+		case PMU_EVENT_SYMBOL_SUFFIX2:
+			return PE_PMU_EVENT_SUF2;
 		case PMU_EVENT_SYMBOL:
 			return parse_state->fake_pmu
 				? PE_PMU_EVENT_FAKE : PE_KERNEL_PMU_EVENT;
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 174158982fae..be8c51770051 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -69,7 +69,7 @@ static void inc_group_count(struct list_head *list,
 %token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT
 %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
 %token PE_ERROR
-%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE
+%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE
 %token PE_ARRAY_ALL PE_ARRAY_RANGE
 %token PE_DRV_CFG_TERM
 %type <num> PE_VALUE
@@ -87,7 +87,7 @@ static void inc_group_count(struct list_head *list,
 %type <str> PE_MODIFIER_EVENT
 %type <str> PE_MODIFIER_BP
 %type <str> PE_EVENT_NAME
-%type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE
+%type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE
 %type <str> PE_DRV_CFG_TERM
 %type <str> event_pmu_name
 %destructor { free ($$); } <str>
@@ -372,6 +372,19 @@ PE_KERNEL_PMU_EVENT opt_pmu_config
 	$$ = list;
 }
 |
+PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF '-' PE_PMU_EVENT_SUF2 sep_dc
+{
+	struct list_head *list;
+	char pmu_name[128];
+	snprintf(pmu_name, sizeof(pmu_name), "%s-%s-%s", $1, $3, $5);
+	free($1);
+	free($3);
+	free($5);
+	if (parse_events_multi_pmu_add(_parse_state, pmu_name, NULL, &list) < 0)
+		YYABORT;
+	$$ = list;
+}
+|
 PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 {
 	struct list_head *list;
-- 
cgit v1.2.3


From 34fa67e72085201ea94b5332eae316951331958f Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Mon, 17 Jan 2022 23:10:14 +0800
Subject: perf test: Add pmu-events test for aliases with hyphens

Add a test for aliases with hyphens in the name to ensure that the
pmu-events tables are as expects. There should be no reason why these sort
of aliases would be treated differently, but no harm in checking.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Liu <liuqi115@huawei.com>
Cc: Shaokun Zhang <zhangshaokun@hisilicon.com>
Cc: linuxarm@huawei.com
Link: https://lore.kernel.org/r/1642432215-234089-3-git-send-email-john.garry@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../pmu-events/arch/test/test_soc/cpu/uncore.json  | 16 +++++++++++
 tools/perf/tests/pmu-events.c                      | 32 ++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json
index 73089c682f80..41bac1c6a008 100644
--- a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json
+++ b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json
@@ -18,6 +18,22 @@
 	    "Invert": "0",
 	    "EdgeDetect": "0"
   },
+  {
+	    "Unit": "CBO",
+	    "EventCode": "0xE0",
+	    "UMask": "0x00",
+	    "EventName": "event-hyphen",
+	    "BriefDescription": "UNC_CBO_HYPHEN",
+	    "PublicDescription": "UNC_CBO_HYPHEN"
+  },
+  {
+	    "Unit": "CBO",
+	    "EventCode": "0xC0",
+	    "UMask": "0x00",
+	    "EventName": "event-two-hyph",
+	    "BriefDescription": "UNC_CBO_TWO_HYPH",
+	    "PublicDescription": "UNC_CBO_TWO_HYPH"
+  },
   {
 	    "EventCode": "0x7",
 	    "EventName": "uncore_hisi_l3c.rd_hit_cpipe",
diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c
index df1c9a3cc05b..1c695fb5a79c 100644
--- a/tools/perf/tests/pmu-events.c
+++ b/tools/perf/tests/pmu-events.c
@@ -143,6 +143,34 @@ static const struct perf_pmu_test_event unc_cbo_xsnp_response_miss_eviction = {
 	.matching_pmu = "uncore_cbox_0",
 };
 
+static const struct perf_pmu_test_event uncore_hyphen = {
+	.event = {
+		.name = "event-hyphen",
+		.event = "umask=0x00,event=0xe0",
+		.desc = "Unit: uncore_cbox UNC_CBO_HYPHEN",
+		.topic = "uncore",
+		.long_desc = "UNC_CBO_HYPHEN",
+		.pmu = "uncore_cbox",
+	},
+	.alias_str = "umask=0,event=0xe0",
+	.alias_long_desc = "UNC_CBO_HYPHEN",
+	.matching_pmu = "uncore_cbox_0",
+};
+
+static const struct perf_pmu_test_event uncore_two_hyph = {
+	.event = {
+		.name = "event-two-hyph",
+		.event = "umask=0x00,event=0xc0",
+		.desc = "Unit: uncore_cbox UNC_CBO_TWO_HYPH",
+		.topic = "uncore",
+		.long_desc = "UNC_CBO_TWO_HYPH",
+		.pmu = "uncore_cbox",
+	},
+	.alias_str = "umask=0,event=0xc0",
+	.alias_long_desc = "UNC_CBO_TWO_HYPH",
+	.matching_pmu = "uncore_cbox_0",
+};
+
 static const struct perf_pmu_test_event uncore_hisi_l3c_rd_hit_cpipe = {
 	.event = {
 		.name = "uncore_hisi_l3c.rd_hit_cpipe",
@@ -188,6 +216,8 @@ static const struct perf_pmu_test_event uncore_imc_cache_hits = {
 static const struct perf_pmu_test_event *uncore_events[] = {
 	&uncore_hisi_ddrc_flux_wcmd,
 	&unc_cbo_xsnp_response_miss_eviction,
+	&uncore_hyphen,
+	&uncore_two_hyph,
 	&uncore_hisi_l3c_rd_hit_cpipe,
 	&uncore_imc_free_running_cache_miss,
 	&uncore_imc_cache_hits,
@@ -654,6 +684,8 @@ static struct perf_pmu_test_pmu test_pmus[] = {
 		},
 		.aliases = {
 			&unc_cbo_xsnp_response_miss_eviction,
+			&uncore_hyphen,
+			&uncore_two_hyph,
 		},
 	},
 	{
-- 
cgit v1.2.3


From b4a7276c5e9a79c238a2fad4fb9498dd3558ad2e Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Mon, 17 Jan 2022 23:10:15 +0800
Subject: perf test: Add parse-events test for aliases with hyphens

Add a test which allows us to test parsing an event alias with hyphens.

Since these events typically do not exist on most host systems, add the
alias to the fake pmu.

Function perf_pmu__test_parse_init() has terms added to match known test
aliases.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Liu <liuqi115@huawei.com>
Cc: Shaokun Zhang <zhangshaokun@hisilicon.com>
Cc: linuxarm@huawei.com
Link: https://lore.kernel.org/r/1642432215-234089-4-git-send-email-john.garry@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/parse-events.c | 49 +++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/parse-events.c  | 42 +++++++++++++++++++++++++++--------
 2 files changed, 82 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index a508f1dbcb2a..e71efadb24f5 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2069,6 +2069,31 @@ static int test_event(struct evlist_test *e)
 	return ret;
 }
 
+static int test_event_fake_pmu(const char *str)
+{
+	struct parse_events_error err;
+	struct evlist *evlist;
+	int ret;
+
+	evlist = evlist__new();
+	if (!evlist)
+		return -ENOMEM;
+
+	parse_events_error__init(&err);
+	perf_pmu__test_parse_init();
+	ret = __parse_events(evlist, str, &err, &perf_pmu__fake);
+	if (ret) {
+		pr_debug("failed to parse event '%s', err %d, str '%s'\n",
+			 str, ret, err.str);
+		parse_events_error__print(&err, str);
+	}
+
+	parse_events_error__exit(&err);
+	evlist__delete(evlist);
+
+	return ret;
+}
+
 static int test_events(struct evlist_test *events, unsigned cnt)
 {
 	int ret1, ret2 = 0;
@@ -2276,6 +2301,26 @@ static int test_pmu_events_alias(char *event, char *alias)
 	return test_event(&e);
 }
 
+static int test_pmu_events_alias2(void)
+{
+	static const char events[][30] = {
+			"event-hyphen",
+			"event-two-hyph",
+	};
+	unsigned long i;
+	int ret = 0;
+
+	for (i = 0; i < ARRAY_SIZE(events); i++) {
+		ret = test_event_fake_pmu(&events[i][0]);
+		if (ret) {
+			pr_err("check_parse_fake %s failed\n", &events[i][0]);
+			break;
+		}
+	}
+
+	return ret;
+}
+
 static int test__parse_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret1, ret2 = 0;
@@ -2313,6 +2358,10 @@ do {							\
 			return ret;
 	}
 
+	ret1 = test_pmu_events_alias2();
+	if (!ret2)
+		ret2 = ret1;
+
 	ret1 = test_terms(test__terms, ARRAY_SIZE(test__terms));
 	if (!ret2)
 		ret2 = ret1;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 879f606e07e6..9739b05b999e 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1697,6 +1697,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
 			}
 		}
 	}
+
+	if (parse_state->fake_pmu) {
+		if (!parse_events_add_pmu(parse_state, list, str, head,
+					  true, true)) {
+			pr_debug("%s -> %s/%s/\n", str, "fake_pmu", str);
+			ok++;
+		}
+	}
+
 out_err:
 	if (ok)
 		*listp = list;
@@ -2168,23 +2177,38 @@ err:
  */
 int perf_pmu__test_parse_init(void)
 {
-	struct perf_pmu_event_symbol *list;
+	struct perf_pmu_event_symbol *list, *tmp, symbols[] = {
+		{(char *)"read", PMU_EVENT_SYMBOL},
+		{(char *)"event", PMU_EVENT_SYMBOL_PREFIX},
+		{(char *)"two", PMU_EVENT_SYMBOL_SUFFIX},
+		{(char *)"hyphen", PMU_EVENT_SYMBOL_SUFFIX},
+		{(char *)"hyph", PMU_EVENT_SYMBOL_SUFFIX2},
+	};
+	unsigned long i, j;
 
-	list = malloc(sizeof(*list) * 1);
+	tmp = list = malloc(sizeof(*list) * ARRAY_SIZE(symbols));
 	if (!list)
 		return -ENOMEM;
 
-	list->type   = PMU_EVENT_SYMBOL;
-	list->symbol = strdup("read");
-
-	if (!list->symbol) {
-		free(list);
-		return -ENOMEM;
+	for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) {
+		tmp->type = symbols[i].type;
+		tmp->symbol = strdup(symbols[i].symbol);
+		if (!list->symbol)
+			goto err_free;
 	}
 
 	perf_pmu_events_list = list;
-	perf_pmu_events_list_num = 1;
+	perf_pmu_events_list_num = ARRAY_SIZE(symbols);
+
+	qsort(perf_pmu_events_list, ARRAY_SIZE(symbols),
+	      sizeof(struct perf_pmu_event_symbol), comp_pmu);
 	return 0;
+
+err_free:
+	for (j = 0, tmp = list; j < i; j++, tmp++)
+		free(tmp->symbol);
+	free(list);
+	return -ENOMEM;
 }
 
 enum perf_pmu_event_symbol_type
-- 
cgit v1.2.3


From f0ac5b85810a69104ee6bc939bcbaecfe4db9a3e Mon Sep 17 00:00:00 2001
From: Minghao Chi <chi.minghao@zte.com.cn>
Date: Wed, 12 Jan 2022 08:01:09 +0000
Subject: perf tools: Remove redundant err variable

Return value from perf_event__process_tracing_data() directly instead
of taking this in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lore.kernel.org/lkml/20220112080109.666800-1-chi.minghao@zte.com.cn
Signed-off-by: CGEL ZTE <cgel.zte@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 409b721666cb..fbf43a454cba 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -535,12 +535,9 @@ static int perf_event__repipe_exit(struct perf_tool *tool,
 static int perf_event__repipe_tracing_data(struct perf_session *session,
 					   union perf_event *event)
 {
-	int err;
-
 	perf_event__repipe_synth(session->tool, event);
-	err = perf_event__process_tracing_data(session, event);
 
-	return err;
+	return perf_event__process_tracing_data(session, event);
 }
 
 static int dso__read_build_id(struct dso *dso)
-- 
cgit v1.2.3


From de1956f48543e90f94b1194395f33140898b39b2 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Thu, 20 Jan 2022 00:38:26 +0000
Subject: KVM: selftests: Re-enable access_tracking_perf_test

This selftest was accidentally removed by commit 6a58150859fd
("selftest: KVM: Add intra host migration tests"). Add it back.

Fixes: 6a58150859fd ("selftest: KVM: Add intra host migration tests")
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220120003826.2805036-1-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 81ebf99d6ff0..0e4926bc9a58 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
 TEST_GEN_PROGS_x86_64 += x86_64/amx_test
+TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
-- 
cgit v1.2.3


From 4cf3d3ebe8794c449af3e0e8c1d790c97e461d20 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 25 Jan 2022 22:17:25 +0000
Subject: KVM: selftests: Don't skip L2's VMCALL in SMM test for SVM guest

Don't skip the vmcall() in l2_guest_code() prior to re-entering L2, doing
so will result in L2 running to completion, popping '0' off the stack for
RET, jumping to address '0', and ultimately dying with a triple fault
shutdown.

It's not at all obvious why the test re-enters L2 and re-executes VMCALL,
but presumably it serves a purpose.  The VMX path doesn't skip vmcall(),
and the test can't possibly have passed on SVM, so just do what VMX does.

Fixes: d951b2210c1a ("KVM: selftests: smm_test: Test SMM enter from L2")
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220125221725.2101126-1-seanjc@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/smm_test.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
index 2da8eb8e2d96..a626d40fdb48 100644
--- a/tools/testing/selftests/kvm/x86_64/smm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -105,7 +105,6 @@ static void guest_code(void *arg)
 
 		if (cpu_has_svm()) {
 			run_guest(svm->vmcb, svm->vmcb_gpa);
-			svm->vmcb->save.rip += 3;
 			run_guest(svm->vmcb, svm->vmcb_gpa);
 		} else {
 			vmlaunch();
-- 
cgit v1.2.3


From dd4516aee365fc9c944c9d6036b6b87363398680 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 26 Jan 2022 07:44:34 -0500
Subject: selftests: kvm: move vm_xsave_req_perm call to amx_test

There is no need for tests other than amx_test to enable dynamic xsave
states.  Remove the call to vm_xsave_req_perm from generic code,
and move it inside the test.  While at it, allow customizing the bit
that is requested, so that future tests can use it differently.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util_base.h    |  1 -
 tools/testing/selftests/kvm/include/x86_64/processor.h |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c             |  7 -------
 tools/testing/selftests/kvm/lib/x86_64/processor.c     | 12 ++++++------
 tools/testing/selftests/kvm/x86_64/amx_test.c          |  2 ++
 5 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 66775de26952..4ed6aa049a91 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
  *   guest_code - The vCPU's entry point
  */
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
-void vm_xsave_req_perm(void);
 
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 423d8a61bd2e..8a470da7b71a 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
 void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
 struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_xsave_req_perm(int bit);
 
 enum x86_page_size {
 	X86_PAGE_SIZE_4K = 0,
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8c53f96ab7fe..d8cf851ab119 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
 	struct kvm_vm *vm;
 	int i;
 
-#ifdef __x86_64__
-	/*
-	 * Permission needs to be requested before KVM_SET_CPUID2.
-	 */
-	vm_xsave_req_perm();
-#endif
-
 	/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
 	if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
 		slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 5f9d7e91dc69..c1d1c195a838 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -665,16 +665,16 @@ static bool is_xfd_supported(void)
 	return !!(eax & CPUID_XFD_BIT);
 }
 
-void vm_xsave_req_perm(void)
+void vm_xsave_req_perm(int bit)
 {
-	unsigned long bitmask;
+	u64 bitmask;
 	long rc;
 
 	if (!is_xfd_supported())
-		return;
+		exit(KSFT_SKIP);
+
+	rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
 
-	rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
-		     XSTATE_XTILE_DATA_BIT);
 	/*
 	 * The older kernel version(<5.15) can't support
 	 * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
@@ -684,7 +684,7 @@ void vm_xsave_req_perm(void)
 
 	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
 	TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
-	TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK,
+	TEST_ASSERT(bitmask & (1ULL << bit),
 		    "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
 		    bitmask);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c
index 523c1e99ed64..52a3ef6629e8 100644
--- a/tools/testing/selftests/kvm/x86_64/amx_test.c
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -329,6 +329,8 @@ int main(int argc, char *argv[])
 	u32 amx_offset;
 	int stage, ret;
 
+	vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT);
+
 	/* Create VM */
 	vm = vm_create_default(VCPU_ID, 0, guest_code);
 
-- 
cgit v1.2.3


From 2201aea114d4c7eedd05865df545dbb987cee5c1 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Tue, 25 Jan 2022 17:13:01 -0700
Subject: rtla: Make doc build optional

rtla build fails due to doc build dependency on rst2man. Make
doc build optional so rtla could be built without docs. Leave
the install dependency on doc_install alone.

Link: https://lkml.kernel.org/r/20220126001301.79096-1-skhan@linuxfoundation.org

Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 2d52ff0bff7d..7c39728d08de 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -59,7 +59,7 @@ endif
 .PHONY:	all
 all:	rtla
 
-rtla: $(OBJ) doc
+rtla: $(OBJ)
 	$(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS)
 
 static: $(OBJ)
-- 
cgit v1.2.3


From aa814c51ab7cb3ddeeeeedaa37d599aee7ba6649 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Tue, 25 Jan 2022 17:22:34 -0700
Subject: tools/tracing: Update Makefile to build rtla

Update tracing Makefile to build/install/clean rtla tragets.

Link: https://lkml.kernel.org/r/20220126002234.79337-1-skhan@linuxfoundation.org

Reviewed-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/Makefile | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/Makefile b/tools/tracing/Makefile
index 87e0ec48e2e7..95e485f12d97 100644
--- a/tools/tracing/Makefile
+++ b/tools/tracing/Makefile
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 include ../scripts/Makefile.include
 
-all: latency
+all: latency rtla
 
-clean: latency_clean
+clean: latency_clean rtla_clean
 
-install: latency_install
+install: latency_install rtla_install
 
 latency:
 	$(call descend,latency)
@@ -16,4 +16,14 @@ latency_install:
 latency_clean:
 	$(call descend,latency,clean)
 
-.PHONY: all install clean latency latency_install latency_clean
+rtla:
+	$(call descend,rtla)
+
+rtla_install:
+	$(call descend,rtla,install)
+
+rtla_clean:
+	$(call descend,rtla,clean)
+
+.PHONY: all install clean latency latency_install latency_clean \
+	rtla rtla_install rtla_clean
-- 
cgit v1.2.3


From 50806fd91428a28d5daa649310dd0ca05b39c118 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 24 Jan 2022 17:55:26 +0000
Subject: kselftest/arm64: Skip VL_INHERIT tests for unsupported vector types

Currently we unconditionally test the ability to set the vector length
inheritance flag via ptrace meaning that we generate false failures on
systems that don't support SVE when we attempt to set the vector length
there. Check the hwcap and mark the tests as skipped when it's not present.

Fixes: 0ba1ce1e8605 ("selftests: arm64: Add coverage of ptrace flags for SVE VL inheritance")
Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220124175527.3260234-2-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index af798b9d232c..0cf78360c5bc 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -557,7 +557,14 @@ static int do_parent(pid_t child)
 		}
 
 		/* prctl() flags */
-		ptrace_set_get_inherit(child, &vec_types[i]);
+		if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
+			ptrace_set_get_inherit(child, &vec_types[i]);
+		} else {
+			ksft_test_result_skip("%s SVE_PT_VL_INHERIT set\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s SVE_PT_VL_INHERIT cleared\n",
+					      vec_types[i].name);
+		}
 
 		/* Step through every possible VQ */
 		for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
-- 
cgit v1.2.3


From 9ae279ecabe3e5bb85567e6c7371f4d35cfa00d6 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 24 Jan 2022 17:55:27 +0000
Subject: kselftest/arm64: Correct logging of FPSIMD register read via ptrace

There's a cut'n'paste error in the logging for our test for reading register
state back via ptrace, correctly say that we did a read instead of a write.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220124175527.3260234-3-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index 0cf78360c5bc..a3c1e67441f9 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -261,7 +261,7 @@ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type)
 	}
 
 	ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD,
-			 "Set FPSIMD registers via %s\n", type->name);
+			 "Got FPSIMD registers via %s\n", type->name);
 	if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD)
 		goto out;
 
-- 
cgit v1.2.3


From b19c99b9f4486f23e3b7248dd4ce3d83e19b9032 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 26 Jan 2022 07:51:00 -0500
Subject: selftests: kvm: check dynamic bits against KVM_X86_XCOMP_GUEST_SUPP

Provide coverage for the new API.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/arch/x86/include/uapi/asm/kvm.h              |  3 +++
 tools/include/uapi/linux/kvm.h                     |  1 +
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 15 +++++++++++++++
 3 files changed, 19 insertions(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 2da3316bb559..bf6e96011dfe 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE	0x00000001
 
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP	0
+
 struct kvm_vmx_nested_state_data {
 	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
 	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 9563d294f181..b46bcdb0cab1 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
 #define KVM_CAP_VM_GPA_BITS 207
 #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index c1d1c195a838..9f000dfb5594 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -667,8 +667,23 @@ static bool is_xfd_supported(void)
 
 void vm_xsave_req_perm(int bit)
 {
+	int kvm_fd;
 	u64 bitmask;
 	long rc;
+	struct kvm_device_attr attr = {
+		.group = 0,
+		.attr = KVM_X86_XCOMP_GUEST_SUPP,
+		.addr = (unsigned long) &bitmask
+	};
+
+	kvm_fd = open_kvm_dev_path_or_exit();
+	rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
+	close(kvm_fd);
+	if (rc == -1 && (errno == ENXIO || errno == EINVAL))
+		exit(KSFT_SKIP);
+	TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
+	if (!(bitmask & (1ULL << bit)))
+		exit(KSFT_SKIP);
 
 	if (!is_xfd_supported())
 		exit(KSFT_SKIP);
-- 
cgit v1.2.3


From 0226bd64da52aa23120d1450c37a424387827a21 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Sat, 29 Jan 2022 13:41:07 -0800
Subject: tools/testing/scatterlist: add missing defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cited commits replaced preemptible with pagefault_disabled and
flush_kernel_dcache_page with flush_dcache_page respectively, hence need
to update the corresponding defines in the test.

  scatterlist.c: In function ‘sg_miter_stop’:
  scatterlist.c:919:4: warning: implicit declaration of function ‘flush_dcache_page’ [-Wimplicit-function-declaration]
      flush_dcache_page(miter->page);
      ^~~~~~~~~~~~~~~~~
  In file included from linux/scatterlist.h:8:0,
                   from scatterlist.c:9:
  scatterlist.c:922:18: warning: implicit declaration of function ‘pagefault_disabled’ [-Wimplicit-function-declaration]
      WARN_ON_ONCE(!pagefault_disabled());
                    ^
  linux/mm.h:23:25: note: in definition of macro ‘WARN_ON_ONCE’
    int __ret_warn_on = !!(condition);                      \
                           ^~~~~~~~~

Link: https://lkml.kernel.org/r/20220118082105.1737320-1-maorg@nvidia.com
Fixes: 723aca208516 ("mm/scatterlist: replace the !preemptible warning in sg_miter_stop()")
Fixes: 0e84f5dbf8d6 ("scatterlist: replace flush_kernel_dcache_page with flush_dcache_page")
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/scatterlist/linux/mm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h
index 16ec895bbe5f..5bd9e6e80625 100644
--- a/tools/testing/scatterlist/linux/mm.h
+++ b/tools/testing/scatterlist/linux/mm.h
@@ -74,7 +74,7 @@ static inline unsigned long page_to_phys(struct page *page)
 	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),   \
 	      x, y)
 
-#define preemptible() (1)
+#define pagefault_disabled() (0)
 
 static inline void *kmap(struct page *page)
 {
@@ -127,6 +127,7 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags)
 #define kmemleak_free(a)
 
 #define PageSlab(p) (0)
+#define flush_dcache_page(p)
 
 #define MAX_ERRNO	4095
 
-- 
cgit v1.2.3